This report summarizes the exploratory analysis of the text data and describes the creation of a predictive model.
# Load the three en_US text files and tabulate, per file, its size on disk
# (in Mb), its number of lines and its number of words.
# NOTE(review): setwd() ties the script to one machine. It is kept because
# later relative paths depend on the working directory -- consider a
# project-relative path instead.
setwd('C:/Users/innugantii/Desktop/Indu/Coursera/Project Final/final/en_US')
file.list <- c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")
text <- list(blogs = "", news = "", twitter = "")
data.summary <- matrix(0, nrow = 3, ncol = 3,
                       dimnames = list(c("blogs", "news", "twitter"),
                                       c("file size, Mb", "lines", "words")))
for (i in seq_along(file.list)) {
  con <- file(file.list[i], "rb")
  # `finally` closes the connection even when readLines() fails part-way,
  # so a read error no longer leaks an open file handle.
  text[[i]] <- tryCatch(
    readLines(con, encoding = "UTF-8", skipNul = TRUE),
    finally = close(con)
  )
  data.summary[i, 1] <- round(file.info(file.list[i])$size / 1024^2, 2)
  data.summary[i, 2] <- length(text[[i]])
  data.summary[i, 3] <- sum(stri_count_words(text[[i]]))
}
# Render the summary matrix as a styled table.
data.summary %>%
  kable() %>%
  kable_styling()
| | file size, Mb | lines | words |
|---|---|---|---|
| blogs | 200.42 | 899288 | 37546246 |
| news | 196.28 | 1010242 | 34762395 |
| twitter | 159.36 | 2360148 | 30093410 |
However, these data sets are very large, so we proceed with a small sample (0.5%) of each data set and then combine the samples into one data set, which is used for the analysis.
## Create the sample
# Draw the same fixed fraction of lines from each source, in the order
# blogs -> news -> twitter; the seed makes the draw reproducible.
set.seed(1234)
drawn <- lapply(text[c("blogs", "news", "twitter")], function(src) {
  sample(src, 0.005 * length(src))
})
blogs <- drawn$blogs
news <- drawn$news
twitter <- drawn$twitter
# One combined character vector holding all sampled lines.
sample_data <- c(blogs, news, twitter)
# Total number of words in the combined sample.
sumWords <- sum(stri_count_words(sample_data))
sumWords
## [1] 517755
# Write the sample to the current working directory.
writeLines(sample_data, "sample_data.txt")
Cleaning the Data
# Strip characters that have no ASCII equivalent. Without `sub`, iconv()
# returns NA for the whole line as soon as it contains one such character,
# silently discarding that line's text.
sample_data <- iconv(sample_data, "UTF-8", "ASCII", sub = "")
# NOTE(review): wrapping the vector in a data frame presumably makes the whole
# sample a single document for the tokenizer -- confirm this is intended.
corpus <- Corpus(VectorSource(as.data.frame(sample_data, stringsAsFactors = FALSE)))
# Lower-case BEFORE removing stopwords: the stopword list is lower-case and
# the removal is case-sensitive, so "The" / "And" would otherwise survive.
corpus <- corpus %>%
  tm_map(tolower) %>%                           # Convert all text to lower case
  tm_map(removeWords, stopwords("english")) %>% # Remove stopwords
  tm_map(removeNumbers) %>%                     # Remove numbers
  tm_map(removePunctuation) %>%                 # Remove punctuation marks
  tm_map(stripWhitespace) %>%                   # Collapse repeated whitespace
  tm_map(PlainTextDocument)                     # An intermediate preprocessing step
Split the sentence into individual words
# Tokenise the corpus into single words and count how often each one occurs.
unigram <- NGramTokenizer(corpus, Weka_control(min = 1, max = 1))
unigram.df <- data.frame(table(unigram))
# Most frequent unigrams first.
unigram.df <- unigram.df[with(unigram.df, order(Freq, decreasing = TRUE)), ]
# Word cloud limited to 125 words, drawn in random order.
wordcloud(
  words = unigram.df$unigram,
  freq = unigram.df$Freq,
  max.words = 125,
  random.order = TRUE,
  colors = brewer.pal(8, "Dark2")
)
text(x = 0.5, y = 0, labels = "Word Cloud of Unigram")
Frequency of Unigram
# Plot area set up
# NOTE(review): layout() configures base graphics only; the ggplot2/plotly
# figure below does not appear to use it -- confirm it is still needed.
layout(matrix(c(1, 1, 2, 3), nrow = 2, ncol = 2, byrow = TRUE))
## Unigram Frequency
# Horizontal bar chart of the first 25 rows of unigram.df.
topUni <- head(unigram.df, 25)
plotUni <- ggplot(topUni, aes(x = reorder(unigram, Freq), y = Freq)) +
  geom_bar(stat = "identity", fill = "brown") +
  xlab("Unigrams") +
  ylab("Frequency") +
  ggtitle("Most frequently used words - Unigrams") +
  coord_flip()
# Convert to an interactive plotly widget.
ggplotly(plotUni)
# Tokenise the corpus into two-word sequences and count each one.
bigram <- NGramTokenizer(corpus, Weka_control(min = 2, max = 2))
bigram.df <- data.frame(table(bigram))
# Most frequent bigrams first.
bigram.df <- bigram.df[with(bigram.df, order(Freq, decreasing = TRUE)), ]
# Word cloud limited to 125 entries, drawn in random order.
wordcloud(
  words = bigram.df$bigram,
  freq = bigram.df$Freq,
  max.words = 125,
  random.order = TRUE,
  colors = brewer.pal(8, "Dark2")
)
text(x = 0.5, y = 0, labels = "Word Cloud of Bigram")
Frequency of Bigram
# NOTE(review): layout() configures base graphics only; the ggplot2/plotly
# figure below does not appear to use it -- confirm it is still needed.
layout(matrix(c(1, 1, 2, 3), nrow = 2, ncol = 2, byrow = TRUE))
## Bigram Frequency
# Horizontal bar chart of the first 25 rows of bigram.df.
topBi <- head(bigram.df, 25)
plotBi <- ggplot(topBi, aes(x = reorder(bigram, Freq), y = Freq)) +
  geom_bar(stat = "identity", fill = "seagreen") +
  xlab("Bigrams") +
  ylab("Frequency") +
  ggtitle("Most frequently used words - Bigrams") +
  coord_flip()
# Convert to an interactive plotly widget.
ggplotly(plotBi)
# Tokenise the corpus into three-word sequences and count each one.
trigram <- NGramTokenizer(corpus, Weka_control(min = 3, max = 3))
trigram.df <- data.frame(table(trigram))
# Most frequent trigrams first.
trigram.df <- trigram.df[with(trigram.df, order(Freq, decreasing = TRUE)), ]
# Word cloud limited to 125 entries, drawn in random order.
wordcloud(
  words = trigram.df$trigram,
  freq = trigram.df$Freq,
  max.words = 125,
  random.order = TRUE,
  colors = brewer.pal(8, "Dark2")
)
text(x = 0.5, y = 0, labels = "Word Cloud of Trigram")
Frequency of Trigram
# NOTE(review): layout() configures base graphics only; the ggplot2/plotly
# figure below does not appear to use it -- confirm it is still needed.
layout(matrix(c(1, 1, 2, 3), nrow = 2, ncol = 2, byrow = TRUE))
## Trigram Frequency
# Horizontal bar chart of the first 25 rows of trigram.df.
topTri <- head(trigram.df, 25)
plotTri <- ggplot(topTri, aes(x = reorder(trigram, Freq), y = Freq)) +
  geom_bar(stat = "identity", fill = "blue") +
  xlab("Trigrams") +
  ylab("Frequency") +
  ggtitle("Most frequently used words - Trigrams") +
  coord_flip()
# Convert to an interactive plotly widget.
ggplotly(plotTri)
# Tokenise the corpus into four-word sequences and count each one.
quadgram <- NGramTokenizer(corpus, Weka_control(min = 4, max = 4))
quadgram.df <- data.frame(table(quadgram))
# Most frequent quadgrams first.
quadgram.df <- quadgram.df[with(quadgram.df, order(Freq, decreasing = TRUE)), ]
# Word cloud limited to 125 entries, drawn in random order.
wordcloud(
  words = quadgram.df$quadgram,
  freq = quadgram.df$Freq,
  max.words = 125,
  random.order = TRUE,
  colors = brewer.pal(8, "Dark2")
)
text(x = 0.5, y = 0, labels = "Word Cloud of Quadgram")
Frequency of Quadgram
# NOTE(review): layout() configures base graphics only; the ggplot2/plotly
# figure below does not appear to use it -- confirm it is still needed.
layout(matrix(c(1, 1, 2, 3), nrow = 2, ncol = 2, byrow = TRUE))
## Quadgram Frequency
# Horizontal bar chart of the first 25 rows of quadgram.df.
topQuad <- head(quadgram.df, 25)
plotQuad <- ggplot(topQuad, aes(x = reorder(quadgram, Freq), y = Freq)) +
  geom_bar(stat = "identity", fill = "purple") +
  xlab("Quadgrams") +
  ylab("Frequency") +
  ggtitle("Most frequently used words - Quadgrams") +
  coord_flip()
# Convert to an interactive plotly widget.
ggplotly(plotQuad)
Predict next word algorithm
(library(tm))
## [1] "ngram" "quanteda" "tokenizers" "slam"
## [5] "devtools" "plotly" "RSQLite" "survey"
## [9] "survival" "Matrix" "grid" "pwr"
## [13] "kableExtra" "knitr" "RWeka" "SnowballC"
## [17] "wordcloud" "RColorBrewer" "tm" "NLP"
## [21] "stringi" "ggplot2" "dplyr" "stats"
## [25] "graphics" "grDevices" "utils" "datasets"
## [29] "methods" "base"
(library(stringr))
## [1] "stringr" "ngram" "quanteda" "tokenizers"
## [5] "slam" "devtools" "plotly" "RSQLite"
## [9] "survey" "survival" "Matrix" "grid"
## [13] "pwr" "kableExtra" "knitr" "RWeka"
## [17] "SnowballC" "wordcloud" "RColorBrewer" "tm"
## [21] "NLP" "stringi" "ggplot2" "dplyr"
## [25] "stats" "graphics" "grDevices" "utils"
## [29] "datasets" "methods" "base"
# Predict the word that follows `inStr` using a back-off n-gram lookup.
#
# The cleaned input is split into words. The last three words are looked up
# as the leading words of an entry in fDF4; if nothing matches, the last two
# words are tried against fDF3, then the last word against fDF2. When no
# table has a match, the first term of fDF1 is used instead. The predicted
# word is the last word of the selected term.
#
# Args:
#   inStr: text typed so far; it is passed through CleanInputString() first.
#
# Returns:
#   A one-row data frame with columns `nextTerm` (the predicted word, "" for
#   empty input) and `mesg` (which table produced the prediction, "" for
#   empty input).
#
# Side effects:
#   Sets the global variable `mesg` to "in PredNextTerm" on entry and, for
#   non-empty input, to the final status message on exit.
#
# NOTE(review): assumes fDF2..fDF4 hold their n-grams in the first column
# (also reachable as `$terms`) and are sorted by decreasing frequency, so the
# first matching row is the most frequent continuation -- confirm.
PredNextTerm <- function(inStr)
{
  # First term in `freqTable` whose leading words are exactly `prefix`, or
  # NULL when there is none.
  firstContinuation <- function(freqTable, prefix) {
    # Fixed-string prefix match including the trailing space: "the" must not
    # match "there is", and regex metacharacters in the input cannot break
    # or distort the search.
    hits <- which(startsWith(as.character(freqTable$terms), paste0(prefix, " ")))
    if (length(hits) == 0) {
      return(NULL)
    }
    as.character(freqTable[hits[1], 1])
  }

  assign("mesg", "in PredNextTerm", envir = .GlobalEnv)

  inStr <- CleanInputString(inStr)
  inStr <- unlist(strsplit(inStr, split = " "))
  # Repeated or leading spaces produce empty tokens that can never match.
  inStr <- inStr[nzchar(inStr)]
  inStrLen <- length(inStr)

  if (inStrLen == 0) {
    return(data.frame(nextTerm = "", mesg = ""))
  }

  predNxtTerm <- NULL

  # Back off from the longest available context to the shortest. A single
  # matching row is enough to make a prediction.
  if (inStrLen >= 3) {
    predNxtTerm <- firstContinuation(
      fDF4, paste(inStr[(inStrLen - 2):inStrLen], collapse = " ")
    )
    if (!is.null(predNxtTerm)) {
      mesg <- " predicting next word using quadgram"
    }
  }
  if (is.null(predNxtTerm) && inStrLen >= 2) {
    predNxtTerm <- firstContinuation(
      fDF3, paste(inStr[(inStrLen - 1):inStrLen], collapse = " ")
    )
    if (!is.null(predNxtTerm)) {
      mesg <- "predicting next word using trigram"
    }
  }
  if (is.null(predNxtTerm)) {
    predNxtTerm <- firstContinuation(fDF2, inStr[inStrLen])
    if (!is.null(predNxtTerm)) {
      mesg <- "predicting next word using bigram."
    }
  }
  if (is.null(predNxtTerm)) {
    predNxtTerm <- as.character(fDF1$terms[1])
    mesg <- "No word found, the most frequently used word is selected as next word."
  }

  # Publish the final status globally in every branch; the fallback message
  # was previously only set locally.
  assign("mesg", mesg, envir = .GlobalEnv)

  data.frame(nextTerm = word(predNxtTerm, -1), mesg = mesg)
}