This paper summarizes the introductory effort put forth to build a Shiny app that accepts a phrase as input and, via a carefully chosen prediction algorithm, returns the most likely next word as output. The app offers word suggestions as the user types. The algorithm (or combination of machine-learning algorithms) must be selected carefully to produce efficient results, and text mining is the vital prerequisite step for achieving this ambitious task.
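As a preview of the prediction step, here is a minimal sketch of a bigram look-up. The frequency table below is a hypothetical stand-in; the real one would be built from the corpora processed in the rest of this report.

# Hypothetical bigram frequency table (stand-in for one built from the corpora)
bigrams <- data.frame(
  first  = c("i", "i", "thank", "good"),
  second = c("love", "hate", "you", "morning"),
  freq   = c(120, 35, 90, 60),
  stringsAsFactors = FALSE
)

# Predict the next word: return the most frequent continuation of the last word
predict_next <- function(phrase, table) {
  last <- tail(strsplit(tolower(phrase), "\\s+")[[1]], 1)
  cand <- table[table$first == last, ]
  if (nrow(cand) == 0) return(NA_character_)
  cand$second[which.max(cand$freq)]
}

predict_next("I", bigrams)   # returns "love"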
rm(list=ls())           # clear the workspace
library(NLP)
library(tm)             # text-mining framework
library(R.utils)
library(stringi)
library(RColorBrewer)
library(wordcloud)
library(data.table)
library(ggplot2)
library(SnowballC)      # stemming support
setwd("/Users/Mughundhan/Coursera/final/en_US")
blogs <- readLines("en_US.blogs.txt")
news <- readLines("en_US.news.txt")
twitter <- readLines("en_US.twitter.txt")
twitter_summary <- summary(twitter)
blogs_summary <- summary(blogs)
news_summary <- summary(news)
cbind(twitter_summary, blogs_summary, news_summary)
## twitter_summary blogs_summary news_summary
## Length "2360148" "899288" "1010242"
## Class "character" "character" "character"
## Mode "character" "character" "character"
Twitter_Size <- file.info("en_US.twitter.txt")$size/1024^2   # file size in MB
Blogs_Size <- file.info("en_US.blogs.txt")$size/1024^2
News_Size <- file.info("en_US.news.txt")$size/1024^2
cbind(Twitter_Size, Blogs_Size, News_Size)
## Twitter_Size Blogs_Size News_Size
## [1,] 159.3641 200.4242 196.2775
blogs_Lines <- NROW(blogs) #Number of Lines in blogs.txt
news_Lines <- NROW(news) #Number of Lines in news.txt
twitter_Lines <- NROW(twitter) #Number of Lines in twitter.txt
cbind(blogs_Lines, news_Lines, twitter_Lines)
## blogs_Lines news_Lines twitter_Lines
## [1,] 899288 1010242 2360148
From the counts above, we can infer that the Twitter data set contains the largest number of lines.
conB <- file("en_US.blogs.txt","r")
conT <- file("en_US.twitter.txt","r")
conN <- file("en_US.news.txt","r")
lenB <- nchar(readLines(conB))
lenT <- nchar(readLines(conT))
lenN <- nchar(readLines(conN))
sapply(list(lenB = lenB, lenT = lenT, lenN = lenN), max, na.rm = TRUE)   # longest line per file; cbind would recycle the unequal-length vectors
## lenB lenT lenN
## 40835 213 11384
close(conB)
close(conT)
close(conN)
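Reading through connections also allows chunked processing. As a minimal sketch (same file name assumed), the longest line can be found without holding an entire file in memory:

# Memory-friendlier variant: stream the file in chunks of 10,000 lines
con <- file("en_US.blogs.txt", "r")
max_len <- 0
repeat {
  chunk <- readLines(con, n = 10000)
  if (length(chunk) == 0) break
  max_len <- max(max_len, nchar(chunk))
}
close(con)
max_len   # longest line in en_US.blogs.txt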
love <- grep(".love.", twitter, ignore.case = FALSE)        # "." matches any single character, so a character is required on each side of "love"
hate <- grep(".hate.", twitter, ignore.case = FALSE)
biostats <- grep(".biostats.", twitter, ignore.case = FALSE)
trans <- function(x) length(x)   # number of matching lines
No_of_love <- trans(love)
No_of_hate <- trans(hate)
No_of_biostats <- trans(biostats)
cbind(No_of_love, No_of_hate, No_of_biostats)
## No_of_love No_of_hate No_of_biostats
## [1,] 86293 21764 1
We can infer that the word “love” occurs far more frequently than the other two words.
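For instance, the love-to-hate ratio follows directly from these counts:

No_of_love / No_of_hate
## [1] 3.964942

So “love” appears roughly four times as often as “hate” in the Twitter data.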
Sample lines containing each of the three words (love, hate, biostats) are shown below.
head(twitter[love])
## [1] "The new sundrop commercial ...hehe love at first sight"
## [2] "I love you, and I'm so proud of you. From sitting on those stairs on The X Factor, to now. You boys are my inspiration.\342\231\245 :) xx"
## [3] "\"You will allow me to continue to do what I love.\" Big Show's wife?"
## [4] "Hahahahaha, I love it. Thanks for the quick birthday lesson :)"
## [5] "And thank everyone of you who has taken the time out of your day to listen to us, your all amazing, much love h2p"
## [6] "I love you brianana"
head(twitter[hate])
## [1] "Have a great evening/afternoon/whatever it is where you are, folks!"
## [2] "That's game we lost I can't hate on the Heat Good Game"
## [3] "Dallas slapped me on my red as cherries sunburn, and im still crying aftr 10 mins., i hate him."
## [4] "I hate when i wear bhoops and people are like \" the bigger the hole the bigger the hoe\" and im just like. O___o bitch try me."
## [5] "Just finished \"one thousand gifts\"....what a read. Just ordered 4 new....hate waiting!"
## [6] "I hate missing school"
twitter[biostats]
## [1] "i know how you feel.. i have biostats on tuesday and i have yet to study =/"
phrase <- grep("A computer once beat me at chess, but it was no match for me at kickboxing", twitter)
twitter[phrase] #The lines
## [1] "A computer once beat me at chess, but it was no match for me at kickboxing"
## [2] "A computer once beat me at chess, but it was no match for me at kickboxing"
## [3] "A computer once beat me at chess, but it was no match for me at kickboxing"
length(phrase) #Total No. of lines
## [1] 3
clean_twitter <- iconv(twitter, 'UTF-8', 'ASCII', "byte")   # convert to ASCII, marking non-convertible bytes
set.seed(123)                                               # make the sample reproducible
sample_twitter <- sample(clean_twitter, 10000)
docT <- VectorSource(sample_twitter)
corp_twitter <- Corpus(docT)
corp_twitter <- tm_map(corp_twitter, content_transformer(tolower))   # content_transformer keeps the corpus valid in tm >= 0.6
#corp_twitter <- tm_map(corp_twitter, removeWords, stopwords("english"))
corp_twitter <- tm_map(corp_twitter, removePunctuation)
corp_twitter <- tm_map(corp_twitter, removeNumbers)
corp_twitter <- tm_map(corp_twitter, stemDocument)
corp_twitter <- tm_map(corp_twitter, stripWhitespace)
dtm <- DocumentTermMatrix(corp_twitter)
dim(dtm)
## [1] 10000 15684
freq_Mat <- as.matrix(dtm)
freq_Sum <- colSums(freq_Mat)                       # total frequency of each term across the sample
freq_twitter <- sort(freq_Sum, decreasing = TRUE)
#write.csv(freq_Mat, file="dtm.csv") #Creates a CSV file
The ten most frequent words are displayed in descending order with their corresponding frequencies.
head(freq_twitter,10)
## the you and for that your with have this just
## 3960 2324 1779 1667 954 776 708 701 680 660
wf <- data.frame(word=names(freq_twitter), freq=freq_twitter)
head(wf,10)
## word freq
## the the 3960
## you you 2324
## and and 1779
## for for 1667
## that that 954
## your your 776
## with with 708
## have have 701
## this this 680
## just just 660
Ignoring the stop words, we can infer that the words “just”, “like”, and “get” occur more frequently than other words.
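That claim can be checked with a minimal sketch that filters the frequency vector against tm's English stop-word list (the commented-out removeWords step above would achieve the same during pre-processing):

content_words <- freq_twitter[!(names(freq_twitter) %in% stopwords("english"))]
head(content_words, 10)   # freq_twitter is already sorted, so this is the new top ten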
The ten least frequent words are displayed with their corresponding frequencies.
tail(freq_twitter,10)
## zones zonesalute zoo zow zubrus zucchini
## 1 1 1 1 1 1
## zuckerberg zurich zuzu zygote
## 1 1 1 1
tail(wf,10)
## word freq
## zones zones 1
## zonesalute zonesalute 1
## zoo zoo 1
## zow zow 1
## zubrus zubrus 1
## zucchini zucchini 1
## zuckerberg zuckerberg 1
## zurich zurich 1
## zuzu zuzu 1
## zygote zygote 1
Several words occur no more than once; a few of them are displayed above (e.g. “zoo”, “zucchini”, and “zygote” each occur only once).
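Such singleton terms inflate the document-term matrix considerably. One standard remedy is tm's removeSparseTerms, sketched below; the 0.999 threshold is an illustrative choice, not a tuned value.

dtm_trim <- removeSparseTerms(dtm, 0.999)   # keep terms present in at least ~0.1% of documents
dim(dtm_trim)                               # far fewer columns than the original 15684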
All words with a frequency of at least 300 are displayed below.
findFreqTerms(dtm, lowfreq=300)
## [1] "about" "all" "and" "are" "but" "can" "day"
## [8] "dont" "for" "from" "get" "good" "great" "have"
## [15] "how" "its" "just" "know" "like" "love" "new"
## [22] "not" "now" "one" "out" "thanks" "that" "the"
## [29] "this" "was" "what" "when" "will" "with" "you"
## [36] "your"
These words occur quite frequently and will be used for plotting and correlation analysis in order to gain a better understanding of the data set.
p <- ggplot(subset(wf, freq > 300), aes(word, freq))   # filter on the data frame's own freq column
p <- p + geom_bar(stat = "identity")
p <- p + theme(axis.text.x = element_text(angle = 45, hjust = 1))
p
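For the correlation side, tm's findAssocs reports the terms whose document-level occurrence correlates most with a given term. A minimal sketch, where the term “love” and the 0.1 cut-off are illustrative choices:

findAssocs(dtm, "love", corlimit = 0.1)   # terms with correlation >= 0.1 to "love"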
set.seed(123)
wordcloud(corp_twitter, max.words = 300, random.order = FALSE, rot.per = 0.35,
          use.r.layout = FALSE, colors = rainbow(6))
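As a pointer toward the next-word prediction goal stated at the outset, bigram counts can be extracted from the same Twitter sample. A minimal sketch using NLP's ngrams, tokenizing naively on whitespace (punctuation left in for brevity):

tokens <- unlist(strsplit(tolower(sample_twitter), "\\s+"))
tokens <- tokens[tokens != ""]                            # drop empty tokens
pairs <- vapply(ngrams(tokens, 2), paste, character(1), collapse = " ")
head(sort(table(pairs), decreasing = TRUE), 10)           # most frequent word pairs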