# Load the three raw corpora from the working directory; each object is a
# character vector holding one element per line of the file.
blogs <- readLines(con = "en_US.blogs.txt")
news <- readLines(con = "en_US.news.txt")
# skipNul = TRUE makes readLines() skip embedded NUL bytes in the twitter file.
twitter <- readLines(con = "en_US.twitter.txt", skipNul = TRUE)

# Show the first six blog lines as a quick sanity check of the raw text.
print(head(blogs, n = 6L))
## [1] "In the years thereafter, most of the Oil fields and platforms were named after pagan “gods”."
## [2] "We love you Mr. Brown."
## [3] "Chad has been awesome with the kids and holding down the fort while I work later than usual! The kids have been busy together playing Skylander on the XBox together, after Kyan cashed in his $$$ from his piggy bank. He wanted that game so bad and used his gift card from his birthday he has been saving and the money to get it (he never taps into that thing either, that is how we know he wanted it so bad). We made him count all of his money to make sure that he had enough! It was very cute to watch his reaction when he realized he did! He also does a very good job of letting Lola feel like she is playing too, by letting her switch out the characters! She loves it almost as much as him."
## [4] "so anyways, i am going to share some home decor inspiration that i have been storing in my folder on the puter. i have all these amazing images stored away ready to come to life when we get our home."
## [5] "With graduation season right around the corner, Nancy has whipped up a fun set to help you out with not only your graduation cards and gifts, but any occasion that brings on a change in one's life. I stamped the images in Memento Tuxedo Black and cut them out with circle Nestabilities. I embossed the kraft and red cardstock with TE's new Stars Impressions Plate, which is double sided and gives you 2 fantastic patterns. You can see how to use the Impressions Plates in this tutorial Taylor created. Just one pass through your die cut machine using the Embossing Pad Kit is all you need to do - super easy!"
## [6] "If you have an alternative argument, let's hear it! :)"
# Remove a fixed set of punctuation characters and lower-case the text.
#
# @param lines Character vector, one line of text per element.
# @return Character vector of the same length with the characters
#   . , ' @ ? ! : ; ( ) $ # - deleted and all letters lower-cased.
strip_punctuation <- function(lines) {
  stopifnot(is.character(lines))
  # One bracket expression replaces thirteen single-character gsub() passes.
  # "-" is placed last inside [...] so it is a literal, not a range operator.
  tolower(gsub("[.,'@?!:;()$#-]", "", lines))
}

# Split cleaned lines into word tokens.
#
# @param lines Character vector as returned by strip_punctuation().
# @return List with one character vector of tokens per input line.
split_into_words <- function(lines) {
  # Splitting on a single " " yields "" tokens whenever a line starts with a
  # space or contains consecutive spaces (common once punctuation such as a
  # free-standing "-" has been deleted); those empty strings then get counted
  # as a word. Split on whitespace runs and drop any zero-length leftovers.
  pieces <- strsplit(lines, "[[:space:]]+")
  lapply(pieces, function(words) words[nzchar(words)])
}

# Clean each corpus in place, then tokenize it. The cleaned line vectors keep
# their original names; the token lists are USblogs / USnews / UStwitter.
blogs <- strip_punctuation(blogs)
USblogs <- split_into_words(blogs)

news <- strip_punctuation(news)
USnews <- split_into_words(news)

twitter <- strip_punctuation(twitter)
UStwitter <- split_into_words(twitter)

# Inspect the tokens of the first six tweets.
print(head(UStwitter))
## [[1]]
## [1] "how" "are" "you" "btw" "thanks" "for" "the"
## [8] "rt" "you" "gonna" "be" "in" "dc" "anytime"
## [15] "soon" "love" "to" "see" "you" "been" "way"
## [22] "way" "too" "long"
##
## [[2]]
## [1] "when" "you" "meet" "someone" "special" "youll" "know"
## [8] "your" "heart" "will" "beat" "more" "rapidly" "and"
## [15] "youll" "smile" "for" "no" "reason"
##
## [[3]]
## [1] "theyve" "decided" "its" "more" "fun" "if" "i"
## [8] "dont"
##
## [[4]]
## [1] "so" "tired" "d" "played" "lazer" "tag" "&"
## [8] "ran" "a" "lot" "d" "ughh" "going" "to"
## [15] "sleep" "like" "in" "5" "minutes"
##
## [[5]]
## [1] "words" "from" "a" "complete" "stranger" "made"
## [7] "my" "birthday" "even" "better"
##
## [[6]]
## [1] "first" "cubs" "game" "ever" "wrigley" "field"
## [7] "is" "gorgeous" "this" "is" "perfect" "go"
## [13] "cubs" "go"
# Collapse each per-line token list into a single character vector per corpus.
LUSblogs <- unlist(USblogs)
LUSnews <- unlist(USnews)
LUStwitter <- unlist(UStwitter)

# Pool every token (blogs first, then news, then twitter) into one vector
# and report how many tokens there are in total.
a <- c(LUSblogs, LUSnews, LUStwitter)
print(length(a))
## [1] 101902277
# Count how often each distinct token occurs in the pooled vector and order
# the counts from most to least frequent.
tb <- sort(table(a), decreasing = TRUE)

# Heading for the frequency listing that follows.
print("Most Common")
## [1] "Most Common"
# The fifty most frequent tokens together with their counts.
print(head(tb, n = 50L))
## a
## the to and a of in i for is
## 4715745 2748637 2397666 2366710 2003693 1637295 1585368 1094776 1069257
## that you it on with was my at be
## 1029769 917391 883542 813625 711784 622710 597806 566700 545439
## this have are as but he not we
## 529864 528273 488017 479008 470763 463792 419689 405024 390920
## from so me all they will by or said
## 382676 375423 358473 325474 314026 313814 312871 308455 304414
## your just his an about its out up one
## 300643 300440 300163 297221 294537 293603 290995 287553 284646
## what if like has when
## 269501 268396 266934 259346 258872
# Bar chart of the ten most frequent tokens, drawn with black bars.
barplot(height = head(tb, n = 10L), col = "black")

# Later, we will conduct statistical analysis and train models to eventually build the app.