## [1] "Carol goes plainclothes to continue the investigation, to get close to the arms dealer on a plane. She has a nightmare about herself in a Black Queen outfit, being forced to kill a girl named Rogue before she \"strips from you everything you are, everyone you ever loved.\" She also doesn't realize she was spotted by Tessa of the Hellfire Club (whom, I believe, was retroactively made a hero by Claremont despite this) who sets the Brotherhood of Evil Mutants on her."
## [1] "Human rights groups and many members of the legal community say the reforms have not gone far enough and the only legitimate way to prosecute Mohammed is in a civilian court, not a commission with a jury of Pentagon-appointed military officers and an Army colonel for a judge."
## [1] "ya the rams theoretically should be a serious contender for awhile if bradford is healthy"
## feature frequency rank docfreq
## 1 of_the 42984 1 35373
## 2 in_the 40729 2 35222
## 3 to_the 21092 3 19346
## 4 for_the 19981 4 18872
## 5 on_the 19750 5 18174
## 6 to_be 16117 6 14796
## 7 at_the 14104 7 13252
## 8 and_the 12585 8 11696
## 9 in_a 11841 9 11202
## 10 with_the 10524 10 9934
## 11 is_a 10152 11 9570
## 12 it_was 9619 12 8633
## 13 for_a 9489 13 9127
## 14 i_was 8749 14 7395
## 15 from_the 8735 15 8256
## 16 i_have 8704 16 7776
## 17 it_is 8253 17 7335
## 18 with_a 8139 18 7720
## 19 and_i 8080 19 7476
## 20 will_be 8045 20 7398
## 21 going_to 7995 21 7336
## 22 of_a 7823 22 7437
## 23 i_am 7667 23 6622
## 24 have_a 7544 24 7306
## 25 is_the 7384 25 7047
## feature frequency rank docfreq
## 1 of_the 42984 1 35373
## 2 in_the 40729 2 35222
## 3 to_the 21092 3 19346
## 4 for_the 19981 4 18872
## 5 on_the 19750 5 18174
## 6 to_be 16117 6 14796
## 7 at_the 14104 7 13252
## 8 and_the 12585 8 11696
## 9 in_a 11841 9 11202
## 10 with_the 10524 10 9934
## 11 is_a 10152 11 9570
## 12 it_was 9619 12 8633
## 13 for_a 9489 13 9127
## 14 i_was 8749 14 7395
## 15 from_the 8735 15 8256
## 16 i_have 8704 16 7776
## 17 it_is 8253 17 7335
## 18 with_a 8139 18 7720
## 19 and_i 8080 19 7476
## 20 will_be 8045 20 7398
## 21 going_to 7995 21 7336
## 22 of_a 7823 22 7437
## 23 i_am 7667 23 6622
## 24 have_a 7544 24 7306
## 25 is_the 7384 25 7047
A
# clear workspace & remove variables
#knitr::opts_chunk$set(echo=TRUE)
#rm(list = ls())
library(quanteda)
library(ggplot2)
library(wordcloud)
### Reading in the three en_US source files (blogs, news, twitter)

# Read every line of a text file and mark the result as UTF-8.
#
# path: path to the text file.
# Returns a character vector with one element per line.
#
# All three sources go through the same settings so they are read
# consistently:
#   * the connection is opened in binary mode ("rb"), as the news file
#     already was -- presumably because a text-mode read stopped early on
#     some bytes in the raw data (TODO confirm);
#   * skipNul = TRUE drops embedded nul bytes instead of truncating lines;
#   * encoding = "UTF-8" marks the strings as UTF-8 (this is what fixed the
#     garbled text noted in the first version of this script).
# on.exit() closes the connection even when readLines() fails.
read_utf8_lines <- function(path) {
  # base::file is spelled out because this script also has a variable `file`.
  con <- base::file(description = path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, n = -1, warn = TRUE, skipNul = TRUE, encoding = "UTF-8")
}

# Reading in US Blogs
file <- "./Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
line <- read_utf8_lines(file)

# Reading in US News
file2 <- "./Coursera-SwiftKey/final/en_US/en_US.news.txt"
line2 <- read_utf8_lines(file2)

# Reading in US Twitter
file3 <- "./Coursera-SwiftKey/final/en_US/en_US.twitter.txt"
line3 <- read_utf8_lines(file3)
B
### Take a 10% random sample of each source. The fraction is kept small for
### speed until the model works; raise sample_frac (e.g. to 0.25) afterwards.
sample_frac <- 0.10

# Fixed seed so the sampled documents (and everything derived from them)
# are the same on every run of the report.
set.seed(1234)

# The sample sizes are derived from the data instead of being typed in by
# hand, so the script still works if the input files change size.
n_blogs <- round(sample_frac * length(line))
n_news <- round(sample_frac * length(line2))
n_twitter <- round(sample_frac * length(line3))

# 10% sample of BLOGS (about 90,000 lines on the original data)
n_blogs
blogS <- sample(line, n_blogs)

# 10% sample of NEWS (about 101,000 lines on the original data)
n_news
newsS <- sample(line2, n_news)

# 10% sample of TWITTER (about 236,000 lines on the original data)
n_twitter
twitterS <- sample(line3, n_twitter)
C
# Build one corpus per sampled source, tag every document with the source it
# came from, then combine the three into a single corpus. The per-source
# corpora may not be needed in the end, but they are handy for EDA.
myCorpus <- corpus(blogS)
myCorpus2 <- corpus(newsS)
myCorpus3 <- corpus(twitterS)

# A single string is assigned as the "Source" document variable of each corpus.
docvars(myCorpus, "Source") <- "blog"
docvars(myCorpus2, "Source") <- "news"
docvars(myCorpus3, "Source") <- "twitter"
#summary(myCorpus)
#summary(myCorpus2)
#summary(myCorpus3)

# Fold the corpora together with `+`, left to right.
allCorpus <- Reduce(`+`, list(myCorpus, myCorpus2, myCorpus3))

# NOTE(review): summary() on a corpus appears to describe only the first 100
# documents by default, so this tail would show blog rows only -- probably why
# the labels seemed not to "stick". Confirm against the quanteda docs.
tail(summary(allCorpus))
D
# Longest document (by token count) in each sample.
#
# Two things matter here:
#   * summary() on a corpus only describes the first 100 documents unless `n`
#     is raised, so all documents are requested explicitly (slower, but the
#     maximum is then taken over the whole sample);
#   * the text has to be looked up by the *position* of the longest document.
#     Indexing with the token count itself (b$Tokens) returns an unrelated
#     document.
# Each corpus was built directly from its sample vector, so row i of the
# summary corresponds to element i of that vector.
tokenInfoB <- summary(myCorpus, n = ndoc(myCorpus))
longest_blog <- which.max(tokenInfoB$Tokens)
b <- tokenInfoB[longest_blog, ]
blogS[longest_blog] # longest text in my blog sample

tokenInfoN <- summary(myCorpus2, n = ndoc(myCorpus2))
longest_news <- which.max(tokenInfoN$Tokens)
c <- tokenInfoN[longest_news, ]
newsS[longest_news] # longest text in my news sample

tokenInfoT <- summary(myCorpus3, n = ndoc(myCorpus3))
longest_tweet <- which.max(tokenInfoT$Tokens)
d <- tokenInfoT[longest_tweet, ]
twitterS[longest_tweet] # longest text in my twitter sample
E
## Plots of the 100 most frequent words in Blogs, News & Twitter

# Frequency table of the top `n` features of a dfm.
#
# x: a document-feature matrix.
# n: how many of the most frequent features to keep.
# Returns the textstat_frequency() table with `feature` converted to a factor
# ordered by decreasing frequency, so ggplot keeps that order on the x axis
# instead of sorting the words alphabetically.
top_features <- function(x, n = 100) {
  freq <- textstat_frequency(x, n = n)
  freq$feature <- with(freq, reorder(feature, -frequency))
  freq
}

# Dot plot of feature frequencies.
#
# freq:  a table as returned by top_features().
# title: plot title.
# Returns the ggplot object; it is drawn when the call is auto-printed at top
# level.
plot_features <- function(freq, title) {
  ggplot(freq, aes(x = feature, y = frequency)) +
    geom_point() +
    # Rotate the word labels so 100 of them fit along the axis.
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    ggtitle(title)
}

# `swearwords` is not defined in this part of the script; it is expected to be
# a vector of patterns to drop, created elsewhere.

# blogs
dfm_Blog <- dfm(myCorpus, tolower = TRUE, remove_punct = TRUE, remove = swearwords)
features_blog_dfm <- top_features(dfm_Blog)
plot_features(features_blog_dfm, "Top 100 Most Frequent Words in Blogs Document")

# news
dfm_Blog2 <- dfm(myCorpus2, tolower = TRUE, remove_punct = TRUE, remove = swearwords)
features_blog_dfm2 <- top_features(dfm_Blog2)
plot_features(features_blog_dfm2, "Top 100 Most Frequent Words in News Document")

# twitter
dfm_Blog3 <- dfm(myCorpus3, tolower = TRUE, remove_punct = TRUE, remove = swearwords)
features_blog_dfm3 <- top_features(dfm_Blog3)
plot_features(features_blog_dfm3, "Top 100 Most Frequent Words in Twitter Document")
F
### Bigram and trigram document-feature matrices for the prediction model

# Build the dfm of n-grams of the given size from the combined corpus.
# Both matrices use exactly the same options; only the n-gram size differs.
# NOTE(review): `swearwords` holds single words while the features here are
# n-grams -- verify that `remove` filters them as intended.
ngram_dfm <- function(size) {
  dfm(
    allCorpus,
    remove = swearwords,
    ngrams = size,
    tolower = TRUE,
    remove_punct = TRUE,
    what = "fasterword",
    verbose = FALSE
  )
}

# n-grams of size 2
dat.dfm <- ngram_dfm(2)
summary(topfeatures(dat.dfm, 20)) # numeric summary of the top-20 counts
topfeatures(dat.dfm)              # most frequent bigrams (default number)

# n-grams of size 3
dat.dfm3 <- ngram_dfm(3)
summary(topfeatures(dat.dfm3, 20)) # numeric summary of the top-20 counts
topfeatures(dat.dfm3)              # most frequent trigrams (default number)