The goal of this project is to show that we have become familiar with the data and are on track to create the prediction algorithm.
This document outlines the steps completed to analyze the data and to prepare it (clean and organize it) for training the predictive models used in the algorithm development stage.
Download the SwiftKey dataset and use the files in the en_US folder.
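A possible way to obtain the files (the download URL and archive paths below are assumptions based on the standard Coursera SwiftKey dataset; adjust them if yours differ):
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file(url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
}
# Extract only the English files; junkpaths = TRUE drops the directory
# structure so they land in the working directory used by the code below.
unzip("Coursera-SwiftKey.zip",
      files = c("final/en_US/en_US.blogs.txt",
                "final/en_US/en_US.news.txt",
                "final/en_US/en_US.twitter.txt"),
      junkpaths = TRUE)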
library(tm)
library(NLP)
library(ngram)
library(RColorBrewer)
library(wordcloud2)
library(reshape2)
library(ggplot2)
library(igraph)
library(RWeka)
library(qdap)
file.info("en_US.blogs.txt")
## size isdir mode mtime ctime
## en_US.blogs.txt 210160014 FALSE 666 2014-07-22 04:43:05 2021-01-27 00:30:55
## atime exe
## en_US.blogs.txt 2021-02-03 19:28:29 no
length(readLines("en_US.blogs.txt", encoding = "UTF-8")) # ncol
## [1] 899288
substr(readLines("en_US.blogs.txt", n = 5, encoding = "UTF-8"), 1,50)
## [1] "In the years thereafter, most of the Oil fields an"
## [2] "We love you Mr. Brown."
## [3] "Chad has been awesome with the kids and holding do"
## [4] "so anyways, i am going to share some home decor in"
## [5] "With graduation season right around the corner, Na"
file.info("en_US.news.txt")
## size isdir mode mtime ctime
## en_US.news.txt 205811889 FALSE 666 2021-01-23 21:10:34 2021-01-27 00:30:56
## atime exe
## en_US.news.txt 2021-02-03 19:28:29 no
length(readLines("en_US.news.txt", encoding = "UTF-8")) # ncol
## [1] 1010242
substr(readLines("en_US.news.txt", n = 5, encoding = "UTF-8"), 1,50)
## [1] "He wasn't home alone, apparently."
## [2] "The St. Louis plant had to close. It would die of "
## [3] "WSU's plans quickly became a hot topic on local on"
## [4] "The Alaimo Group of Mount Holly was up for a contr"
## [5] "And when it's often difficult to predict a law's i"
file.info("en_US.twitter.txt")
## size isdir mode mtime ctime
## en_US.twitter.txt 167104940 FALSE 666 2021-01-24 20:45:33 2021-01-27 00:30:56
## atime exe
## en_US.twitter.txt 2021-02-03 19:28:29 no
length(readLines("en_US.blogs.txt", encoding = "UTF-8")) # ncol
## [1] 899288
substr(readLines("en_US.blogs.txt", n = 5, encoding = "UTF-8"), 1,50)
## [1] "In the years thereafter, most of the Oil fields an"
## [2] "We love you Mr. Brown."
## [3] "Chad has been awesome with the kids and holding do"
## [4] "so anyways, i am going to share some home decor in"
## [5] "With graduation season right around the corner, Na"
The sample was limited to 2,000 lines per file because of CPU limitations; NLP processing consumes a lot of CPU.
First we read the data corpus and perform the preprocessing steps:
Read the files with UTF-8 encoding so that non-ASCII characters are handled correctly.
dataset_1 = readLines("en_US.blogs.txt",encoding="UTF-8", n = 2000)
dataset_2 = readLines("en_US.news.txt",encoding="UTF-8", n = 2000)
dataset_3 = readLines("en_US.twitter.txt",encoding="UTF-8", n = 2000)
dataset = paste(dataset_1, dataset_2, dataset_3) # combine the three samples (paste joins the corresponding lines into single strings)
Convert the corpus to lower case so that matching is case-insensitive.
Remove numbers to avoid unnecessary prediction results.
Remove extra whitespace.
Remove punctuation.
dataset = sent_detect(dataset, language = "en", model = NULL) # split the text into sentences
body = VCorpus(VectorSource(dataset)) # Building the main body
body = tm_map( body, removeNumbers) # removing numbers
body = tm_map( body, stripWhitespace) # removing whitespaces
body = tm_map( body, tolower) #lowercasing all contents
as.character(body[[1]])
## [1] "in the years thereafter, most of the oil fields and platforms were named after pagan “gods”."
body = tm_map( body, removePunctuation) # removing special characters
bad_words = readLines("bad.txt") # profanity list to filter out
body = tm_map( body, removeWords, bad_words)
body = tm_map( body, removeWords, stopwords("english"))
x = data.frame(body)
term_doc =as.TermDocumentMatrix( x$text, x$doc_id)
term_matrix = as.matrix(term_doc)
words = sort(rowSums( term_matrix) , decreasing=TRUE )
df = data.frame(word = names(words),freq=words)
wordcloud2(data=df, size=1.6, color='random-dark', shape = "cardioid")
A heat map is a graphical representation of data where the individual values contained in a matrix are represented as colors.
term_rm = removeSparseTerms(term_doc, 0.99)
term_dense = as.matrix(term_rm)
term_dense = melt(term_dense, value.name = "count")
x = term_dense[1:500,]
ggplot(x , aes(x = Docs, y = Terms, fill = log10(count))) +
geom_tile(colour = "white") +
scale_fill_gradient(high="#FF0000" , low="#FFFFFF")+
ylab("") +
theme(panel.background = element_blank()) +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
Plot a network graph of the terms with the igraph package.
term_rm = removeSparseTerms(term_doc, 0.99)
term_dense = as.matrix(term_rm)
term_dense = melt(term_dense, value.name = "count")
term_dense = subset(term_dense, count> 100 , c("Terms","count"))
term_graph = graph_from_data_frame(term_dense)
plot.igraph( term_graph , layout = layout.circle, edge.arrow.size=0.5,
vertex.color="gold",vertex.size=15, vertex.frame.color="gray",
vertex.label.color="black", vertex.label.cex=0.8,
vertex.label.dist=2, edge.curved=0.2 )
Terms with a frequency of over 100:
findFreqTerms(term_doc, lowfreq = 100)
## [1] "also" "back" "can" "day" "dont" "even" "first" "get"
## [9] "go" "going" "good" "got" "great" "home" "im" "just"
## [17] "know" "last" "like" "little" "love" "make" "many" "much"
## [25] "new" "now" "one" "people" "really" "right" "said" "say"
## [33] "see" "still" "things" "think" "time" "two" "us" "want"
## [41] "way" "well" "will" "work" "year"
The end goal of this application is to predict the next word from the text that has been entered.
Some words are more frequent than others, so here we analyze the distribution of word frequencies; the same is done for the 2-grams and 3-grams in the dataset. Distribution of word frequencies for single-word, two-word and three-word combinations: the data is prepared by converting the n-gram counts to data frames and ordering them by frequency for charting.
body_2 <- gsub("http\\w+", "", body) # remove URLs
token_1 <- NGramTokenizer(body_2, Weka_control(min = 1, max = 1))
df <- data.frame(table(token_1))
token_1 <- df[order(df$Freq, decreasing = TRUE),]
word_freq <- token_1[1:20,]
colnames(word_freq) <- c("Word","Frequency")
ggplot(word_freq, aes(x = Word, y = Frequency)) +
geom_bar(stat="identity", fill="blue") +
geom_text(aes(label=Frequency), vjust=-0.2) +
theme(axis.text.x = element_text(angle = 90))
token_2 <- NGramTokenizer(body_2, Weka_control(min = 2, max = 2))
df <- data.frame(table(token_2))
token_2 <- df[order(df$Freq, decreasing = TRUE),]
word_freq <- token_2[1:20,]
colnames(word_freq) <- c("Word","Frequency")
ggplot(word_freq, aes(x = Word, y = Frequency)) +
geom_bar(stat="identity", fill="blue") +
geom_text(aes(label=Frequency), vjust=-0.2) +
theme(axis.text.x = element_text(angle = 90))
token_3 <- NGramTokenizer(body_2, Weka_control(min = 3, max = 3))
df <- data.frame(table(token_3))
token_3 <- df[order(df$Freq, decreasing = TRUE),]
word_freq <- token_3[1:20,]
colnames(word_freq) <- c("Word","Frequency")
ggplot(word_freq, aes(x = Word, y = Frequency)) +
geom_bar(stat="identity", fill="blue") +
geom_text(aes(label=Frequency), vjust=-0.2) +
theme(axis.text.x = element_text(angle = 90))
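As a first, hypothetical sketch of how these n-gram tables could feed the next-word prediction described above: the helper below (predict_next_word is an illustrative name, not part of this analysis) looks up the most frequent bigram in token_2 that starts with the last word typed and returns its second word. A real model would also back off to lower-order n-grams when no match is found.
# Hypothetical sketch: predict the next word from the bigram frequency table.
predict_next_word <- function(input, bigrams) {
  last_word <- tolower(tail(strsplit(trimws(input), "\\s+")[[1]], 1))
  bigrams$token_2 <- as.character(bigrams$token_2)
  # keep the bigrams whose first word matches the last word of the input
  matches <- bigrams[startsWith(bigrams$token_2, paste0(last_word, " ")), ]
  if (nrow(matches) == 0) return(NA_character_)
  best <- matches$token_2[which.max(matches$Freq)]
  strsplit(best, " ")[[1]][2] # the predicted next word
}
predict_next_word("one of the", token_2)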
These are the analyses that were carried out during week 2 of the “Data Science Capstone Project” course.