setwd("C:/Users/apundhir/Desktop/final/en_US")
require(tm)
require(stringr)
require(SnowballC)
require(slam)
require(RWeka)
require(wordcloud)
require(ggplot2)
require(ggthemes)
require(knitr)
#Reading the data files
textblog <- readLines("en_US.blogs.txt")
textnews <- readLines("en_US.news.txt")
texttwitter <- readLines("en_US.twitter.txt")
#Getting the line count of each file
Line_Count <- c(length(textblog), length(textnews), length(texttwitter))
#Getting the total character count of each file (nchar counts characters, not words)
charblog <- sum(nchar(textblog))
charnews <- sum(nchar(textnews))
chartwitter <- sum(nchar(texttwitter))
Char_Count <- c(charblog, charnews, chartwitter)
#Getting the maximum character count per record
Max_Chars_Blog <- max(nchar(textblog))
Max_Chars_News <- max(nchar(textnews))
Max_Chars_Twitter <- max(nchar(texttwitter))
Max_Chars <- c(Max_Chars_Blog, Max_Chars_News, Max_Chars_Twitter)
Name <- c("Blog", "News", "Twitter")
df <- data.frame(Name, Line_Count, Char_Count, Max_Chars)
kable(df)
| Name | Line_Count | Char_Count | Max_Chars |
|---|---|---|---|
| Blog | 899288 | 208361438 | 40835 |
| News | 77259 | 15683765 | 5760 |
| Twitter | 2360148 | 162384825 | 213 |
#Displaying Sample Data
head(textblog, 3)
## [1] "In the years thereafter, most of the Oil fields and platforms were named after pagan â<U+0080><U+009C>godsâ<U+0080>."
## [2] "We love you Mr. Brown."
## [3] "Chad has been awesome with the kids and holding down the fort while I work later than usual! The kids have been busy together playing Skylander on the XBox together, after Kyan cashed in his $$$ from his piggy bank. He wanted that game so bad and used his gift card from his birthday he has been saving and the money to get it (he never taps into that thing either, that is how we know he wanted it so bad). We made him count all of his money to make sure that he had enough! It was very cute to watch his reaction when he realized he did! He also does a very good job of letting Lola feel like she is playing too, by letting her switch out the characters! She loves it almost as much as him."
head(textnews,3)
## [1] "He wasn't home alone, apparently."
## [2] "The St. Louis plant had to close. It would die of old age. Workers had been making cars there since the onset of mass automotive production in the 1920s."
## [3] "WSU's plans quickly became a hot topic on local online sites. Though most people applauded plans for the new biomedical center, many deplored the potential loss of the building."
head(texttwitter, 3)
## [1] "How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long."
## [2] "When you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason."
## [3] "they've decided its more fun if I don't."
#Combining the three sources and randomly sampling about 5% of the lines
set.seed(234)
textcons <- c(textblog, textnews, texttwitter)
rm(textblog, textnews, texttwitter)
index <- as.logical(rbinom(n = length(textcons), size = 1, prob = 0.05))
textcons <- textcons[index]
# remove retweet entities and @mentions first, before punctuation is stripped;
# otherwise the "@" these patterns rely on is already gone
textcons <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", textcons)
textcons <- gsub("@\\w+", "", textcons)
# remove punctuation
textcons <- gsub("[[:punct:]]", "", textcons)
# remove numbers
textcons <- gsub("[[:digit:]]", "", textcons)
# collapse repeated spaces and tabs into a single space
textcons <- gsub("[ \t]{2,}", " ", textcons)
# trim leading and trailing whitespace
textcons <- gsub("^\\s+|\\s+$", "", textcons)
#Removing line breaks
textcons <- gsub("[\r\n]", "", textcons)
#Removing non-graphical characters so that data can be processed by the tm package
textcons <- str_replace_all(textcons, "[^[:graph:]]", " ")
#Removing any remaining non-alphanumeric characters
textcons <- gsub("[^[:alnum:][:blank:]']", "", textcons)
head(textcons, 3)
## [1] "When PC came out with the Love You a Latte Lite Cartridge I knew I had to have it"
## [2] "Whew there is much talk lately about rising fares due to increased fuel surcharges"
## [3] "Dear Heavenly Father I thank You for teaching me to prayer earnest heartfelt and continued prayers As I spend time to pray like this today Your divine power is manifested changes are effected and circumstances in my life are changed in Jesus Christ Name I pray Amen"
#Making data ready using tm package
corpuscons <- Corpus(VectorSource(textcons))
corpuscons <- tm_map(corpuscons, tolower)
corpuscons <- tm_map(corpuscons, removePunctuation)
corpuscons <- tm_map(corpuscons, removeNumbers)
corpuscons <- tm_map(corpuscons, PlainTextDocument)
wordcloud(corpuscons, scale=c(5,0.5), max.words=100, random.order=FALSE,
rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8, "Dark2"))
#Tokenizer Functions
UnigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
QuadgramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
unigramtdm <- TermDocumentMatrix(corpuscons, control = list(tokenize = UnigramTokenizer))
unifreq <- row_sums(unigramtdm)
# sort the list by reverse frequency using built-in order function:
unifreq <- unifreq[order(-unifreq)]
head(unifreq, n=50)
## the and you for that with this was have are
## 143657 76113 41184 36997 34788 23029 20826 19852 19359 17895
## but not your all just from out like they what
## 16302 14833 13460 13071 12265 11455 11099 10811 10564 10543
## will one about its can when get time there more
## 10466 10338 9947 9826 9289 9242 8940 8160 7794 7730
## his had some our who good has love how now
## 7619 7491 7394 7349 7281 7258 7254 7051 6952 6939
## her would know day were been new their she them
## 6898 6856 6832 6637 6426 6402 6179 6125 5969 5961
bigramtdm <- TermDocumentMatrix(corpuscons, control = list(tokenize = BigramTokenizer))
bifreq <- row_sums(bigramtdm)
# sort the list by reverse frequency using built-in order function:
bifreq <- bifreq[order(-bifreq)]
head(bifreq, n=30)
## of the in the to the for the on the to be at the i have
## 12347 11963 6613 6593 6298 5803 4410 3769
## and the in a is a i am i was and i it was with the
## 3714 3610 3604 3522 3475 3394 3269 3232
## for a it is if you going to have a is the will be to get
## 3203 3060 3006 2970 2816 2734 2688 2664
## want to from the with a have to that i one of
## 2421 2419 2402 2382 2357 2353
trigramtdm <- TermDocumentMatrix(corpuscons, control = list(tokenize = TrigramTokenizer))
trifreq <- row_sums(trigramtdm)
# sort the list by reverse frequency using built-in order function:
trifreq <- trifreq[order(-trifreq)]
head(trifreq, n=30)
## thanks for the one of the a lot of
## 1135 987 916
## to be a going to be i want to
## 653 622 605
## i have to looking forward to the end of
## 535 531 490
## i have a it was a thank you for
## 488 480 466
## be able to out of the i need to
## 463 453 429
## cant wait to one of my the rest of
## 421 421 408
## you want to some of the is going to
## 396 384 381
## i love you as well as im going to
## 379 378 371
## for the follow i dont know part of the
## 370 363 360
## a couple of there is a to go to
## 359 358 358
quadgramtdm <- TermDocumentMatrix(corpuscons, control = list(tokenize = QuadgramTokenizer))
quadfreq <- row_sums(quadgramtdm)
# sort the list by reverse frequency using built-in order function:
quadfreq <- quadfreq[order(-quadfreq)]
head(quadfreq, n=30)
## thanks for the follow the end of the at the end of
## 287 262 229
## the rest of the for the first time at the same time
## 229 195 178
## is going to be cant wait to see thanks for the rt
## 172 157 151
## in the middle of thank you for the if you want to
## 149 144 136
## thank you so much one of the most going to be a
## 132 131 129
## when it comes to is one of the one of my favorite
## 124 120 120
## to be able to i am going to i dont want to
## 112 109 100
## i cant wait to one of the best i wish i could
## 99 99 88
## i was going to its going to be i would like to
## 87 87 86
## what do you think on the other hand the middle of the
## 86 83 82
df2 <- data.frame(head(names(bifreq), 20), head(bifreq, 20))
names(df2) <- c("Term", "Count")
df3 <- data.frame(head(names(trifreq), 20), head(trifreq, 20))
names(df3) <- c("Term", "Count")
df4 <- data.frame(head(names(quadfreq), 20), head(quadfreq, 20))
names(df4) <- c("Term", "Count")
# order the bars by frequency rather than alphabetically
plot2 <- ggplot(df2, aes(x = reorder(Term, Count), y = Count))
plot2 <- plot2 + geom_bar(stat = "identity", width = 0.6) +
    coord_flip() +
    theme_economist() + scale_colour_economist()
plot2 <- plot2 + theme(axis.text.y = element_text(size = 12, vjust = 0.5),
    legend.title = element_blank(),
    plot.title = element_text(size = 15, face = "bold", hjust = 0.5, vjust = 0.1),
    axis.title.x = element_text(size = 12, vjust = 1),
    axis.title.y = element_text(size = 12, vjust = 2)) +
    labs(x = "Terms", y = "Number of Occurrences", title = "Bigram Occurrences")
plot2
plot3 <- ggplot(df3, aes(x = reorder(Term, Count), y = Count))
plot3 <- plot3 + geom_bar(stat = "identity", width = 0.6) +
    coord_flip() +
    theme_economist() + scale_colour_economist()
plot3 <- plot3 + theme(axis.text.y = element_text(size = 12, vjust = 0.5),
    legend.title = element_blank(),
    plot.title = element_text(size = 15, face = "bold", hjust = 0.5, vjust = 0.1),
    axis.title.x = element_text(size = 12, vjust = 1),
    axis.title.y = element_text(size = 12, vjust = 2)) +
    labs(x = "Terms", y = "Number of Occurrences", title = "Trigram Occurrences")
plot3
plot4 <- ggplot(df4, aes(x = reorder(Term, Count), y = Count))
plot4 <- plot4 + geom_bar(stat = "identity", width = 0.6) +
    coord_flip() +
    theme_economist() + scale_colour_economist()
plot4 <- plot4 + theme(axis.text.y = element_text(size = 12, vjust = 0.5),
    legend.title = element_blank(),
    plot.title = element_text(size = 15, face = "bold", hjust = 0.5, vjust = 0.1),
    axis.title.x = element_text(size = 12, vjust = 1),
    axis.title.y = element_text(size = 12, vjust = 2)) +
    labs(x = "Terms", y = "Number of Occurrences", title = "Quadgram Occurrences")
plot4
# Find how many of the most frequent unigrams are needed to cover 50%
# (and, further below, 90%) of all word occurrences in the sample
totalword <- sum(unifreq)
counter <- 0
halfcoverageindex <- 0
maxcoverageindex <- 0
for (i in 1:length(unifreq)){
counter <- counter + unifreq[i]
if (counter >= totalword/2){
halfcoverageindex <- i
break
}
}
unifreq[1:halfcoverageindex]
## the and you for that with
## 143657 76113 41184 36997 34788 23029
## this was have are but not
## 20826 19852 19359 17895 16302 14833
## your all just from out like
## 13460 13071 12265 11455 11099 10811
## they what will one about its
## 10564 10543 10466 10338 9947 9826
## can when get time there more
## 9289 9242 8940 8160 7794 7730
## his had some our who good
## 7619 7491 7394 7349 7281 7258
## has love how now her would
## 7254 7051 6952 6939 6898 6856
## know day were been new their
## 6832 6637 6426 6402 6179 6125
## she them see dont back people
## 5969 5961 5785 5749 5388 5341
## think great make going then really
## 5194 5160 4853 4728 4658 4629
## too well much today thanks into
## 4578 4559 4511 4460 4385 4364
## only got which first want way
## 4299 4284 4261 4226 4136 4118
## did because right need here than
## 4058 4042 3996 3963 3955 3914
## work over still him other very
## 3871 3816 3799 3794 3766 3740
## even last said could also life
## 3656 3639 3626 3590 3564 3514
## off after where any cant little
## 3380 3361 3358 3328 3244 3226
## should come never say down why
## 3217 3194 3171 3169 3138 3134
## being lol these two most take
## 3130 3113 3104 3066 3037 3011
## many happy night best year those
## 2987 2976 2953 2913 2855 2844
## things something always before made next
## 2814 2813 2798 2797 2744 2715
## week home while thats follow look
## 2710 2544 2526 2505 2499 2487
## through feel world better sure hope
## 2443 2432 2402 2392 2367 2363
## every may show long again thank
## 2337 2332 2312 2300 2288 2251
## ever man youre thing around another
## 2248 2236 2232 2231 2225 2219
## tonight find ill ive same getting
## 2202 2136 2136 2132 2115 2104
## big few let does school keep
## 2089 2088 2066 2061 2024 1996
## use looking book game own doing
## 1991 1979 1953 1933 1932 1927
## help everyone fun old morning yes
## 1925 1901 1897 1896 1886 1872
## god itâs such give please years
## 1868 1864 1860 1857 1836 1819
## didnt thought put though since lot
## 1801 1787 1774 1764 1759 1741
## until someone away friends place wait
## 1732 1720 1718 1713 1694 1671
## tell having bad pretty done end
## 1665 1663 1647 1640 1635 1630
## used family both nice might each
## 1621 1620 1612 1608 1605 1592
## live days start tomorrow free read
## 1592 1585 1579 1576 1564 1562
## part twitter house weekend music real
## 1555 1555 1541 1529 1512 1498
## went hard yet without must coming
## 1497 1488 1470 1464 1453 1426
## donât enough try iâm soon working
## 1422 1418 1417 1415 1412 1410
## actually already awesome found story makes
## 1406 1406 1389 1386 1382 1381
## call stop check making play nothing
## 1380 1374 1365 1362 1362 1350
## everything left food trying haha myself
## 1342 1336 1322 1296 1291 1291
## once hey kids maybe girl anyone
## 1280 1278 1277 1273 1270 1255
## anything person guys miss ready post
## 1255 1252 1248 1247 1247 1241
## three amazing came bit name wish
## 1241 1240 1232 1223 1215 1200
## watching far talk money told yeah
## 1199 1196 1193 1190 1190 1190
## believe friend blog watch between different
## 1189 1185 1179 1174 1173 1173
## true together head team during kind
## 1173 1164 1163 1162 1152 1152
## beautiful hate room
## 1149 1148 1146
# Repeat the scan for 90% coverage
maxcoverageindex <- 0
counter <- 0
for (i in 1:length(unifreq)){
counter <- counter + unifreq[i]
if (counter >= 0.9 * totalword){
maxcoverageindex <- i
break
}
}
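# Neither coverage index is displayed above; one possible way to report both
# figures (the printed values depend on the 5% sample drawn earlier):
c(words_for_50pct_coverage = halfcoverageindex,
  words_for_90pct_coverage = maxcoverageindex)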
The prediction model will use a naive back-off algorithm: to match a particular pattern it will first check how many words of context are available.
If three words are available, it will first try to find the closest matching 4-gram and, if a match is found, predict the fourth word. If there is no match, it will back off to the last two words and search the 3-grams, and so on.
If no match is found at any level, one possibility is to ask the user for the intended word and add that record to the corpus for better future predictions.
I will also have to evaluate which coverage level gives the best trade-off between accuracy and performance; that is an area that will require a considerable amount of work.
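As a rough sketch of this back-off lookup (illustration only: it reuses the quadfreq, trifreq and bifreq tables built above, assumes the input phrase has been cleaned and lower-cased the same way as the corpus, and the function name predictNextWord is hypothetical):
# Hypothetical back-off predictor: try the longest matching n-gram first,
# then back off to shorter contexts; names and structure are illustrative only.
predictNextWord <- function(phrase) {
    words <- strsplit(tolower(phrase), "\\s+")[[1]]
    tables <- list(quadfreq, trifreq, bifreq)  # 4-grams, 3-grams, 2-grams
    for (i in seq_along(tables)) {
        ncontext <- 4 - i                      # context words each table needs
        if (length(words) < ncontext) next
        context <- paste(tail(words, ncontext), collapse = " ")
        # candidate n-grams are those whose leading words match the context
        hits <- tables[[i]][startsWith(names(tables[[i]]), paste0(context, " "))]
        if (length(hits) > 0) {
            best <- names(hits)[which.max(hits)]
            return(tail(strsplit(best, " ")[[1]], 1))  # predicted next word
        }
    }
    NA_character_                              # no match at any n-gram level
}
For example, given the counts shown above, predictNextWord("thanks for the") should return "follow", since "thanks for the follow" is the most frequent 4-gram starting with that context.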