Sys.setenv(JAVA_HOME = "C:\\Program Files\\Java\\jre1.8.0_261\\")  # RWeka needs a JVM
install.packages("readtext")
install.packages("tm")
install.packages("wordcloud")     # word-cloud generator
install.packages("RColorBrewer")
install.packages("RWeka")
install.packages("dplyr")
install.packages("SnowballC")
install.packages("stringi")
install.packages("kableExtra")
library("SnowballC")
library(RWeka)
library("readtext")
library(tm)
## Loading required package: NLP
library(RColorBrewer)
library(wordcloud)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringi)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
set.seed(4321)
blogs <- readLines("en_US.blogs.txt")
blogs_sample <- sample(blogs, length(blogs) * 0.002)
twit <- readLines("en_US.twitter.txt")
## Warning in readLines("en_US.twitter.txt"): line 167155 appears to contain an
## embedded nul
## Warning in readLines("en_US.twitter.txt"): line 268547 appears to contain an
## embedded nul
## Warning in readLines("en_US.twitter.txt"): line 1274086 appears to contain an
## embedded nul
## Warning in readLines("en_US.twitter.txt"): line 1759032 appears to contain an
## embedded nul
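These embedded-nul warnings are benign (the nul characters are dropped either way), but they can be silenced by skipping nuls explicitly:
twit <- readLines("en_US.twitter.txt", skipNul = TRUE)  # skip embedded nuls without a warning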
twit_sample <- sample(twit, length(twit) * 0.002)
news <- readLines("en_US.news.txt")
## Warning in readLines("en_US.news.txt"): incomplete final line found on
## 'en_US.news.txt'
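This warning hints that readLines hit a stray control character that text mode (notably on Windows) treats as end-of-file, which is why the news line count in the table below is far lower than its file size suggests. A hedged fix, not applied here so the summary numbers still reflect the truncated read, is to open the connection in binary mode (news_full is an illustrative name):
con <- file("en_US.news.txt", open = "rb")  # binary mode reads past the control character
news_full <- readLines(con, skipNul = TRUE)
close(con)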
news_sample <- sample(news, length(news) * 0.002)
all <- c(blogs_sample, twit_sample, news_sample)
all_sample <- all  # "all" is already a 0.2% sample per source; resampling 0.2% of it would leave only a handful of lines
b <- file.info("en_US.blogs.txt")$size
t <- file.info("en_US.twitter.txt")$size
n <- file.info("en_US.news.txt")$size
b <- round(b/1024/1024)
t <- round(t/1024/1024)
n <- round(n/1024/1024)
a <- b + t + n
lb <- length(blogs)
lt <- length(twit)
ln <- length(news)
la <- lb + lt + ln
wb <- sum(stri_count_words(blogs))
wt <- sum(stri_count_words(twit))
wn <- sum(stri_count_words(news))
wa <- wb + wt + wn
df <- data.frame("Size_in_Mb"      = c(b, t, n, a),
                 "Number_of_Lines" = c(lb, lt, ln, la),
                 "Number_of_Words" = c(wb, wt, wn, wa))
row.names(df) <- c("Blogs", "Twitter", "News", "All")
kable(df, format = "html",
      caption = "Size (MB), Number of Lines, and Number of Words for the Four Datasets",
      digits = 0, align = "ccrr") %>%
  kable_styling(full_width = FALSE, position = "left")
|         | Size_in_Mb | Number_of_Lines | Number_of_Words |
|---------|-----------:|----------------:|----------------:|
| Blogs   |        200 |          899288 |        38154238 |
| Twitter |        159 |         2360148 |        30218125 |
| News    |        196 |           77259 |         2693898 |
| All     |        555 |         3336695 |        71066261 |
all_sample_corpus <- VCorpus(VectorSource(all_sample))
all_sample_corpus <- tm_map(all_sample_corpus, stripWhitespace) # Eliminate extra white spaces
all_sample_corpus <- tm_map(all_sample_corpus, removePunctuation) # Remove punctuations
all_sample_corpus <- tm_map(all_sample_corpus, content_transformer(tolower)) # Convert the text to lower case
all_sample_corpus <- tm_map(all_sample_corpus, removeNumbers) # Remove numbers
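A gap in this cleaning pass: removePunctuation runs before anything strips URLs or @handles, so those survive as fused tokens such as "httptcoxyz". A small custom transformer applied before removePunctuation would handle them; toSpace and url_free are illustrative names, not part of tm:
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
url_free <- tm_map(VCorpus(VectorSource(all_sample)), toSpace, "https?://\\S+")  # strip URLs
url_free <- tm_map(url_free, toSpace, "@\\w+")                                   # strip @handles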
corpus_sample <- TermDocumentMatrix(all_sample_corpus)
corpus_sample_matrix <- as.matrix(corpus_sample)
corpus_sample_matrix <- sort(rowSums(corpus_sample_matrix),decreasing=TRUE)
corpus_df <- data.frame(word = names(corpus_sample_matrix),freq=corpus_sample_matrix)
head(corpus_df, 10)
## word freq
## the the 5821
## and and 3145
## you you 1726
## for for 1562
## that that 1429
## with with 951
## this this 842
## was was 825
## have have 799
## are are 728
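Converting the full TermDocumentMatrix to a dense matrix works at this sample size but will not scale to larger samples. As a sketch, the same frequencies can be computed directly on the sparse representation via slam, a package tm already depends on:
freqs <- sort(slam::row_sums(corpus_sample), decreasing = TRUE)  # sparse row sums, no dense matrix needed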
p<-cumsum(corpus_df$freq)/sum(corpus_df$freq)
which(p>=0.5)[1]
## [1] 263
which(p>=0.90)[1]
## [1] 7050
which(p>=0.99)[1]
## [1] 16579
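So 263 distinct words cover 50% of all word instances in the sample, 7,050 cover 90%, and 16,579 cover 99%. The same cut-offs can be wrapped in a small helper (coverage is an illustrative name):
coverage <- function(freq, p) {
  # first rank at which the cumulative share of word instances reaches p
  which(cumsum(sort(freq, decreasing = TRUE)) / sum(freq) >= p)[1]
}
sapply(c(0.50, 0.90, 0.99), coverage, freq = corpus_df$freq)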
process_corpus <- function(corpus) {
  corpus <- VCorpus(VectorSource(corpus))
  corpus <- corpus %>%
    tm_map(removeNumbers) %>%                                         # remove numbers
    tm_map(stripWhitespace) %>%                                       # collapse extra white space
    tm_map(removePunctuation, preserve_intra_word_dashes = TRUE) %>%  # remove punctuation
    tm_map(content_transformer(tolower))                              # convert to lower case
  ## tm_map(removeWords, stopwords("en"))  # stopword removal left out
  corpus
}
ngrams <- function(corpus, n, N = NA) {
  corpus <- process_corpus(corpus)
  token <- function(x) {
    NGramTokenizer(x, Weka_control(min = n, max = n))  # tokenize into exactly-n-word grams
  }
  tdm <- TermDocumentMatrix(corpus, control = list(tokenize = token))
  rsum <- rowSums(as.matrix(tdm))
  ngram <- data.frame(ngram = names(rsum), freq = rsum)
  ngram <- ngram[order(-ngram$freq), ]  # most frequent first
  if (!is.na(N)) {
    ngram <- head(ngram, N)             # keep only the top N
  }
  return(ngram)
}
## Plot Unigrams for all datasets
blogs_df <- ngrams(blogs_sample, 1, 50)
twit_df <- ngrams(twit_sample, 1, 50)
news_df <- ngrams(news_sample, 1, 50)
all_df <- ngrams(all_sample, 1, 50)
par(mfrow=c(2, 2))
barplot(blogs_df[1:10, ]$freq, las = 3, names.arg = blogs_df[1:10, ]$ngram,
        col = "red2", main = "Frequency of words in blogs",
        ylab = "Number of Occurrences")
barplot(twit_df[1:10, ]$freq, las = 3, names.arg = twit_df[1:10, ]$ngram,
        col = "chartreuse1", main = "Frequency of words in Twitter",
        ylab = "Number of Occurrences")
barplot(news_df[1:10, ]$freq, las = 3, names.arg = news_df[1:10, ]$ngram,
        col = "blue3", main = "Frequency of words in the news",
        ylab = "Number of Occurrences")
barplot(all_df[1:10, ]$freq, las = 3, names.arg = all_df[1:10, ]$ngram,
        col = "chocolate2", main = "Frequency of words in all the data",
        ylab = "Number of Occurrences")
par(mfrow=c(1,1))
wordcloud(words = blogs_df$ngram, freq = blogs_df$freq, min.freq = 1,
max.words=50, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Set1"))
wordcloud(words = twit_df$ngram, freq = twit_df$freq, min.freq = 1,
max.words=50, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Spectral"))
wordcloud(words = news_df$ngram, freq = news_df$freq, min.freq = 1,
max.words=50, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "RdYlGn"))
wordcloud(words = all_df$ngram, freq = all_df$freq, min.freq = 1,
max.words=50, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Paired"))
blogs_df <- ngrams(blogs_sample, 2, 50)
twit_df <- ngrams(twit_sample, 2, 50)
news_df <- ngrams(news_sample, 2, 50)
all_df <- ngrams(all_sample, 2, 50)
par(mfrow=c(2, 2))
barplot(blogs_df[1:10, ]$freq, las = 3, names.arg = blogs_df[1:10, ]$ngram,
        col = "red2", main = "Frequency of bigrams in blogs",
        ylab = "Number of Occurrences")
barplot(twit_df[1:10, ]$freq, las = 3, names.arg = twit_df[1:10, ]$ngram,
        col = "chartreuse1", main = "Frequency of bigrams in Twitter",
        ylab = "Number of Occurrences")
barplot(news_df[1:10, ]$freq, las = 3, names.arg = news_df[1:10, ]$ngram,
        col = "blue3", main = "Frequency of bigrams in the news",
        ylab = "Number of Occurrences")
barplot(all_df[1:10, ]$freq, las = 3, names.arg = all_df[1:10, ]$ngram,
        col = "chocolate2", main = "Frequency of bigrams in all the data",
        ylab = "Number of Occurrences")
par(mfrow=c(1,1))
wordcloud(words = blogs_df$ngram, freq = blogs_df$freq, min.freq = 1,
max.words=50, scale=c(1,1), random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Set1"))
wordcloud(words = twit_df$ngram, freq = twit_df$freq, min.freq = 1,
max.words=50, scale=c(1,1), random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Spectral"))
wordcloud(words = news_df$ngram, freq = news_df$freq, min.freq = 1,
max.words=50, scale=c(1,1), random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "RdYlGn"))
wordcloud(words = all_df$ngram, freq = all_df$freq, min.freq = 1,
          max.words = 50, scale = c(1, 1), random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Paired"))
blogs_df <- ngrams(blogs_sample, 3, 50)
twit_df <- ngrams(twit_sample, 3, 50)
news_df <- ngrams(news_sample, 3, 50)
all_df <- ngrams(all_sample, 3, 100)
par(mfrow=c(2, 2))
barplot(blogs_df[1:10, ]$freq, las = 3, names.arg = blogs_df[1:10, ]$ngram,
        col = "red2", main = "Frequency of trigrams in blogs",
        ylab = "Number of Occurrences")
barplot(twit_df[1:10, ]$freq, las = 3, names.arg = twit_df[1:10, ]$ngram,
        col = "chartreuse1", main = "Frequency of trigrams in Twitter",
        ylab = "Number of Occurrences")
barplot(news_df[1:10, ]$freq, las = 3, names.arg = news_df[1:10, ]$ngram,
        col = "blue3", main = "Frequency of trigrams in the news",
        ylab = "Number of Occurrences")
barplot(all_df[1:10, ]$freq, las = 3, names.arg = all_df[1:10, ]$ngram,
        col = "chocolate2", main = "Frequency of trigrams in all the data",
        ylab = "Number of Occurrences")
par(mfrow=c(1,1))
wordcloud(words = blogs_df$ngram, freq = blogs_df$freq, min.freq = 1,
max.words=50, scale=c(1,1), random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Set1"))
wordcloud(words = twit_df$ngram, freq = twit_df$freq, min.freq = 1,
max.words=50, scale=c(1,1), random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Spectral"))
wordcloud(words = news_df$ngram, freq = news_df$freq, min.freq = 1,
max.words=50, scale=c(1,1), random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "RdYlGn"))
wordcloud(words = all_df$ngram, freq = all_df$freq, min.freq = 3,
          max.words = 100, scale = c(0.5, 0.5), random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Paired"))
Future Considerations

1. The datasets are still very large (especially the combined "all" set); a smaller sample may be needed in the future.
2. Should profanities be removed? I decided not to remove them because
   - none of them appeared among the top unigrams, bigrams, or trigrams, and
   - they might be useful for prediction.
3. Foreign words were also not prominent in the top n-grams. In the future, an English dictionary might be needed to filter them out.
4. Split the data into a training set and a test set for cross-validation.
5. Build a prediction model that predicts the next word based on n-grams (a minimal lookup sketch follows this list).
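As a minimal sketch of item 5, assuming the full bigram and trigram tables produced by ngrams() above (predict_next and the final fallback word are illustrative choices, not a final design):
bigrams <- ngrams(all_sample, 2)
trigrams <- ngrams(all_sample, 3)
predict_next <- function(phrase) {
  words <- strsplit(tolower(trimws(phrase)), "\\s+")[[1]]
  if (length(words) >= 2) {                       # try a trigram match on the last two words
    key <- paste(tail(words, 2), collapse = " ")
    hit <- trigrams[grepl(paste0("^", key, " "), trigrams$ngram), ]
    if (nrow(hit) > 0) return(sub(".* ", "", as.character(hit$ngram[1])))
  }
  hit <- bigrams[grepl(paste0("^", tail(words, 1), " "), bigrams$ngram), ]  # back off to bigrams
  if (nrow(hit) > 0) return(sub(".* ", "", as.character(hit$ngram[1])))
  "the"                                           # final fallback: the most frequent unigram
}
predict_next("thanks for the")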