The goal of this project is simply to show that we have become comfortable working with the data and that we are on track to create our prediction algorithm.
library(knitr, quietly = TRUE)
library(doParallel, quietly = TRUE)
library(stringi, quietly = TRUE)
library(SnowballC, quietly = TRUE)
library(tm, quietly = TRUE)
# Locations of the three en_US training files (blogs, twitter, news).
path1 <- "D:/R/Work/CapStone/training data/final/en_US/en_US.blogs.txt"
path2 <- "D:/R/Work/CapStone/training data/final/en_US/en_US.twitter.txt"
path3 <- "D:/R/Work/CapStone/training data/final/en_US/en_US.news.txt"

# Read every line of a text file through a binary-mode connection.
#
# path: path of the file to read.
# Returns a character vector with one element per line.
#
# skipNul = TRUE drops embedded nul bytes instead of ending the line at the
# nul with a warning; the twitter file triggered that warning on lines
# 167155, 268547, 1274086 and 1759032.
read_text_lines <- function(path) {
  conn <- file(path, open = "rb")
  # Close the connection even if readLines() fails.
  on.exit(close(conn), add = TRUE)
  readLines(conn, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_text_lines(path1)
twitter <- read_text_lines(path2)
news <- read_text_lines(path3)
# Summary table for the three raw files: size in MB plus the line/character
# counts from stri_stats_general() and a word count per file.

# File sizes in bytes, converted to megabytes.
size_mb <- file.info(c(path1, path2, path3))$size / 1024^2

# One column per file; rows are the statistics reported by stringi.
general_stats <- sapply(list(blogs, twitter, news), stri_stats_general)
latex_stats <- sapply(list(blogs, twitter, news), stri_stats_latex)

file_stats <- data.frame(
  fileName = c("en_US.blogs", "en_US.twitter", "en_US.news"),
  fileSize = size_mb,
  # Row 4 of the stri_stats_latex() result is used as the word count;
  # transpose so that each file becomes one row of the table.
  t(rbind(general_stats, WordCount = latex_stats[4, ]))
)

kable(file_stats)
| fileName | fileSize | Lines | LinesNEmpty | Chars | CharsNWhite | WordCount |
|---|---|---|---|---|---|---|
| en_US.blogs | 200.4242 | 899288 | 899288 | 206824382 | 170389539 | 37570839 |
| en_US.twitter | 159.3641 | 2360148 | 2360148 | 162096031 | 134082634 | 30451128 |
| en_US.news | 196.2775 | 1010242 | 1010242 | 203223154 | 169860866 | 34494539 |
We take only a sample of the data for the analysis because the full files are huge. We remove special characters, extra white space, punctuation, etc. from the sample. We also remove profanity from the data, using the bad-word list available from the following Google Code archive: [https://code.google.com/archive/p/badwordslist/downloads].
# Fix the RNG seed so the same lines are drawn on every run.
set.seed(2980)

# Number of lines drawn (without replacement) from each source.
sample_size <- 12000

smpl_Blogs <- sample(blogs, sample_size, replace = FALSE)
smpl_Twitter <- sample(twitter, sample_size, replace = FALSE)
smpl_News <- sample(news, sample_size, replace = FALSE)
# Strip every character that cannot be represented in ASCII.
#
# x: character vector whose elements are treated as UTF-8 text.
# Returns a character vector of the same length as x in which all
# non-ASCII characters have been dropped (sub = "").
clean_the_data <- function(x) {
  # iconv() is vectorised, so no element-wise loop is needed. This also
  # fixes empty input: looping over 1:length(x) visited index 1 of an empty
  # vector and returned NA instead of character(0).
  iconv(x, "UTF-8", "ASCII", sub = "")
}
# Clean the sampled data (drop non-ASCII characters)
smpl_Blogs <- clean_the_data(smpl_Blogs)
smpl_Twitter <- clean_the_data(smpl_Twitter)
smpl_News <- clean_the_data(smpl_News)
# Merge the cleaned data and delete the intermediate objects.
smpl_data <- list(smpl_Blogs, smpl_Twitter, smpl_News)
# Only objects that exist are removed: the previous call also named `path4`,
# which is not defined anywhere in the visible script, so rm() warned about it.
rm(blogs, news, twitter, path1, path2, path3)
# Containers filled per source: one corpus and one document-term matrix each.
corpus <- list()
dtMatrix <- list()
# Bad-word list, one entry per line; only the first 457 lines are read.
profanity <- readLines("D:/R/Work/CapStone/training data/final/en_US/badwords.txt", n=457)
# tm content transformer that deletes every entry of the global `profanity`
# vector from the text of a document.
#
# x: the document content (character) handed over by tm_map().
# Returns x with each bad word replaced by the empty string.
#
# Entries are still interpreted as regular expressions, as before, but they
# are now matched only as whole words (\\b ... \\b). Plain substring matching
# mangled innocent words that merely contain a listed entry.
removeProfanity <- content_transformer(function(x) {
  # Drop NA/blank entries. Iterating 1:length(profanity) on an empty list
  # produced an NA pattern, and gsub() with an NA pattern turns all text into NA.
  bad_words <- trimws(profanity)
  bad_words <- bad_words[!is.na(bad_words) & nzchar(bad_words)]
  for (bad_word in bad_words) {
    x <- gsub(paste0("\\b(", bad_word, ")\\b"), "", x)
  }
  x
})
# Build one cleaned corpus and one document-term matrix per sampled source.
# seq_along() keeps the loop from running when smpl_data is empty.
for (i in seq_along(smpl_data)) {
  corpus[[i]] <- Corpus(VectorSource(smpl_data[[i]]))
  # tolower() is a plain base function, not a tm transformation; wrapping it
  # in content_transformer() keeps the documents valid tm text documents.
  corpus[[i]] <- tm_map(corpus[[i]], content_transformer(tolower))
  corpus[[i]] <- tm_map(corpus[[i]], removePunctuation)
  corpus[[i]] <- tm_map(corpus[[i]], removeNumbers)
  corpus[[i]] <- tm_map(corpus[[i]], stripWhitespace)
  corpus[[i]] <- tm_map(corpus[[i]], removeProfanity)
  corpus[[i]] <- tm_map(corpus[[i]], removeWords, stopwords("english"))
  corpus[[i]] <- tm_map(corpus[[i]], stemDocument)
  # wordLengths = c(0, Inf) keeps terms of every length in the matrix.
  dtMatrix[[i]] <- DocumentTermMatrix(
    corpus[[i]],
    control = list(wordLengths = c(0, Inf))
  )
}
# The raw sample list is no longer needed once the corpora exist.
rm(smpl_data)
With the help of the wordcloud package we show what each corpus looks like. Here is the example for the US English Blogs corpus; the other two corpora, or a combination of all three, can be visualized in the same way.
library(wordcloud, quietly = TRUE)
library(slam, quietly = TRUE)
# Set random seed for reproducibility
set.seed(2980)
# Plot in 1 row x 3 columns; keep the previous settings so they can be restored
old_par <- par(mfrow = c(1, 3))
Headings <- c("Word Cloud - US English Blogs",
              "Word Cloud - US English Twitter",
              "Word Cloud - US English News")
# Iterate over each corpus / dtMatrix pair and plot its word cloud (max 100
# words). seq_along() skips the loop entirely when there is no corpus.
for (i in seq_along(corpus)) {
  wordcloud(words = colnames(dtMatrix[[i]]),
            # Total count of each term across all documents of the corpus.
            freq = slam::col_sums(dtMatrix[[i]]),
            scale = c(3, 1), max.words = 100, random.order = FALSE,
            rot.per = 0.45, use.r.layout = FALSE,
            colors = brewer.pal(8, "Dark2"))
  title(Headings[i])
}
# Restore the graphics layout that was active before this section.
par(old_par)
The plots below show how often the most frequent words and word sequences occur in each corpus. Unigrams, bigrams and trigrams are shown.
library(dplyr, quietly = TRUE)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(qdap)
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
##
## Attaching package: 'qdapRegex'
## The following object is masked from 'package:dplyr':
##
## explain
## Loading required package: qdapTools
##
## Attaching package: 'qdapTools'
## The following object is masked from 'package:dplyr':
##
## id
##
## Attaching package: 'qdap'
## The following object is masked from 'package:dplyr':
##
## %>%
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, as.TermDocumentMatrix
## The following object is masked from 'package:NLP':
##
## ngrams
## The following object is masked from 'package:base':
##
## Filter
library(rJava)#.jinit(parameters="-Xmx128g")
library(RWeka)
library(ggplot2, quietly = TRUE)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:qdapRegex':
##
## %+%
## The following object is masked from 'package:NLP':
##
## annotate
# Plot the N most frequent unigrams, bigrams and trigrams of a text sample
# side by side (1 row, 3 columns) using RWeka, ggplot2 and gridExtra.
#
# x:        character vector of text to tokenise (defaults to the global
#           smpl_Blogs sample).
# subTitle: label appended to each plot title, e.g. "Blogs".
# N:        number of top n-grams shown in each plot.
# Returns the value of gridExtra::grid.arrange(), which draws the three plots.
plot.Grams <- function (x=smpl_Blogs, subTitle="Blogs", N=10) {
  # Delimiters for bigrams and trigrams. The bigram step previously omitted
  # the leading space, so text was never split on spaces there.
  ngram_delimiters <- " \\r\\n\\t.,;:\"()?!"

  # Tokenise x with the given Weka control, count the tokens and return a
  # bar chart of the N most frequent ones.
  top_ngram_plot <- function(control, label, fill) {
    tokens <- RWeka::NGramTokenizer(x, control)
    freq <- data.frame(table(tokens))
    colnames(freq) <- c("Word", "Freq")
    freq <- head(freq[order(freq$Freq, decreasing = TRUE), ], N)
    ggplot(freq, aes(x = reorder(Word, Freq), y = Freq)) +
      geom_bar(stat = "identity", fill = fill) +
      ggtitle(paste(label, "-", subTitle)) +
      xlab(label) + ylab("Frequency") +
      theme(axis.text.x = element_text(angle = 90, hjust = 1))
  }

  # Unigrams keep the tokenizer's default delimiters, as before.
  g1 <- top_ngram_plot(Weka_control(min = 1, max = 1), "Unigrams", "green")
  g2 <- top_ngram_plot(
    Weka_control(min = 2, max = 2, delimiters = ngram_delimiters),
    "Bigrams", "blue"
  )
  # Trigrams are built from x; previously smpl_Blogs was hard-coded here, so
  # every call showed the Blogs trigrams regardless of the input.
  g3 <- top_ngram_plot(
    Weka_control(min = 3, max = 3, delimiters = ngram_delimiters),
    "Trigrams", "darkgreen"
  )

  # Put three plots into 1 row 3 columns
  gridExtra::grid.arrange(g1, g2, g3, ncol = 3)
}
# Draw the top-12 n-gram charts for each sampled source, in the order
# Blogs, Twitter, News; the list names double as the plot sub-titles.
gram_sources <- list(Blogs = smpl_Blogs, Twitter = smpl_Twitter, News = smpl_News)
for (source_name in names(gram_sources)) {
  plot.Grams(x = gram_sources[[source_name]], subTitle = source_name, N = 12)
}