This section summarizes the three source files (blogs, news and Twitter): word counts, line counts, character counts and file sizes, collected in a single table.
library(ngram)
# Paths of the three en_US corpus files (blogs, news, twitter).
filenames <- paste0(
  "final/en_US/en_US.",
  c("blogs.txt", "news.txt", "twitter.txt")
)

# Size of each file on disk, in bytes. file.info() is vectorised over paths,
# so no per-file apply is needed.
filesizes <- file.info(filenames)$size

# Read every file completely into memory, one character vector of lines per
# file. lapply() always returns a list, whereas sapply() would silently
# collapse to a matrix if the files happened to have equal line counts.
# The list is named by path so the summary table gets the paths as row names.
textfiles <- lapply(setNames(filenames, filenames), readLines)

# Per-file totals. vapply() pins each result to a single number per file.
wordcounts <- vapply(textfiles, wordcount, numeric(1))
linecounts <- lengths(textfiles)
# Sum in double precision so very large files cannot overflow integer range.
charcounts <- vapply(
  textfiles,
  function(lines) sum(as.numeric(nchar(lines))),
  numeric(1)
)

# Printed summary table: one row per file.
data.frame(wordcounts, linecounts, charcounts, filesizes)
## wordcounts linecounts charcounts filesizes
## final/en_US/en_US.blogs.txt 37334131 899288 206824505 210160014
## final/en_US/en_US.news.txt 34372530 1010242 203223159 205811889
## final/en_US/en_US.twitter.txt 30373543 2360148 162096031 167105338
This section creates a sub-sample of the original data: each line of the three files is kept independently with probability 0.01 (roughly 1% of the lines), and the result is written to a separate file so that later runs can simply read it back.
# File that caches the sampled lines so later runs reuse the same sample.
sample.filename <- "final/en_US/sample.txt"

if (!file.exists(sample.filename)) {
  # Fixed seed so the sample is reproducible.
  set.seed(53)
  # Keep each line independently with probability 0.01. Files are visited in
  # order, so the random draws are consumed in the same sequence as an
  # explicit loop over the files would consume them.
  sampled <- unlist(
    lapply(seq_along(textfiles), function(i) {
      keep <- as.logical(rbinom(linecounts[i], 1, 0.01))
      textfiles[[i]][keep]
    }),
    use.names = FALSE
  )
  writeLines(sampled, sample.filename)
} else {
  sampled <- readLines(sample.filename)
}

# The full texts are not used by the code that follows; release the memory.
rm(textfiles)
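This section cleans the sampled data: it removes numbers, punctuation, English stop words and profanity, converts the text to lower case, and collapses repeated whitespace.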
library(tm)
library(dplyr)
# Profanity list, one term per line, downloaded on every run.
badwords <- readLines("http://www.cs.cmu.edu/~biglou/resources/bad-words.txt")
# Drop blank entries: removeWords() joins the terms into one regular-expression
# alternation, and an empty alternative would match the empty string at every
# word boundary instead of the actual terms.
badwords <- trimws(badwords)
badwords <- badwords[nzchar(badwords)]

# Build the corpus from the sampled lines and normalise it step by step.
corpus <- Corpus(VectorSource(sampled)) %>%
  tm_map(removeNumbers) %>%
  # Base functions such as tolower() must be wrapped in content_transformer()
  # so the documents stay valid tm text documents for the following steps.
  tm_map(content_transformer(tolower)) %>%
  # Stop words are removed BEFORE punctuation: the English list contains
  # contractions such as "don't" and "i'm", which could never match once
  # their apostrophes have been stripped.
  tm_map(removeWords, stopwords("en")) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, badwords) %>%
  # NOTE(review): quotes are punctuation and should already be gone at this
  # point, so this step is probably a no-op; kept for safety.
  tm_map(removeWords, c("\'", "\"")) %>%
  tm_map(stripWhitespace)
This section explores the cleaned sample further by plotting the 20 most frequent unigrams, bigrams and trigrams.
library(ggplot2)
# Cleaned documents, one string per sampled line.
# NOTE(review): assumes `corpus` is the simple corpus built by
# Corpus(VectorSource(...)), whose content is a plain character vector.
texts <- content(corpus)

# Join all documents into one string for the n-gram tokenizer. Documents are
# separated by a single space: joining with "" would glue the last word of
# each document to the first word of the next one and create bogus tokens.
# Surrounding whitespace and empty documents are dropped first so the joins do
# not produce runs of separators.
docs <- trimws(texts)
docs <- docs[nzchar(docs)]
corpus_text <- paste(docs, collapse = " ")

# Return the `top` most frequent rows of an n-gram phrase table, with the
# n-gram column turned into a factor whose level order follows the table
# order (so the plot keeps the frequency ranking instead of sorting
# alphabetically).
top_phrases <- function(ng, top = 20L) {
  phrases <- head(get.phrasetable(ng), n = top)
  phrases$ngrams <- factor(phrases$ngrams, levels = phrases$ngrams)
  phrases
}

# Horizontal bar chart of n-gram frequencies for a table from top_phrases().
plot_phrases <- function(phrases, title) {
  ggplot(data = phrases, aes(ngrams, freq)) +
    geom_bar(stat = "identity") +
    theme(text = element_text(size = 18)) +
    ggtitle(title) +
    coord_flip()
}

unigrams <- ngram(corpus_text, n = 1)
df <- top_phrases(unigrams)
plot_phrases(df, "20 most frequent unigrams")

bigrams <- ngram(corpus_text, n = 2)
df <- top_phrases(bigrams)
plot_phrases(df, "20 most frequent bigrams")

trigrams <- ngram(corpus_text, n = 3)
df <- top_phrases(trigrams)
plot_phrases(df, "20 most frequent trigrams")