con <- file("en_US.twitter.txt", "r")
dataTwitter <- readLines(con)
close(con)
con <- file("en_US.blogs.txt", "r")
dataBlogs <- readLines(con)
close(con)
con <- file("en_US.news.txt", "r")
dataNews <- read.table(con,sep="\n",quote = "", header=F,stringsAsFactors = F)
close(con)
library(tm)
library(RWeka)
library(ggplot2)
library(ngram)
A basic summary of the three datasets:
dataS <- cbind(c(length(dataTwitter), length(dataBlogs), length(dataNews[, 1])),
               c(wordcount(dataTwitter), wordcount(dataBlogs), wordcount(dataNews[, 1])))
dataSummary <- data.frame(dataset = c("dataTwitter", "dataBlogs", "dataNews"), dataS)
names(dataSummary)[2:3] <- c("Length", "Wordcount")
dataSummary
## dataset Length Wordcount
## 1 dataTwitter 2360148 30373543
## 2 dataBlogs 899288 37334131
## 3 dataNews 77258 2643184
# Work with a random sample of 1000 tweets to keep processing fast
set.seed(123)
test <- sample(dataTwitter, 1000)
corpora <- Corpus(VectorSource(test))
# Note: stopwords are removed before lowercasing, so capitalized stopwords
# ("The", "Will", ...) survive and show up in the frequency counts below
corpora <- tm_map(corpora, removeWords, stopwords("english"))
# Apply the remaining cleaning steps in a single pass with tm_reduce()
funs <- list(stripWhitespace,
             removePunctuation,
             removeNumbers,
             content_transformer(tolower))
corpora <- tm_map(corpora, FUN = tm_reduce, tmFuns = funs)
corpora[[1]][1]
## $content
## [1] "thanks rting"
# Replace leftover special characters. In the original pattern "$|&|^|<|>",
# "$" and "^" are regex anchors, so the literal characters were never matched;
# a character class fixes that. Wrapping the gsub() in content_transformer()
# keeps the documents as PlainTextDocument objects, so no separate
# tm_map(corpora, PlainTextDocument) step is needed.
corpora <- tm_map(corpora, content_transformer(function(x) gsub("[$&^<>/@#]", " ", x)))
# Unigram tokenizer (min = max = 1 means single words)
unigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
tdm <- TermDocumentMatrix(corpora, control = list(tokenize = unigramTokenizer))
tdmM <- as.matrix(tdm)
frequencies <- sort(rowSums(tdmM), decreasing = TRUE)[1:20]
df <- data.frame(word = names(frequencies), freq = frequencies)
frequencies
## just like love good now will know one can the
## 59 59 49 48 44 43 41 41 40 39
## get thanks today day lol time got new life want
## 37 36 33 32 31 31 28 28 23 23
And a bit more information from some ‘tm’ helper functions:
findFreqTerms(tdm, lowfreq = 20)
## [1] "back" "beep" "can" "day" "follow" "get" "good"
## [8] "got" "great" "just" "know" "life" "like" "lol"
## [15] "love" "make" "new" "now" "one" "people" "really"
## [22] "see" "thanks" "the" "think" "time" "today" "want"
## [29] "will"
findAssocs(tdm,"just",corlimit=.2)
## $just
## walk hundred uswirl miles
## 0.28 0.26 0.26 0.23
# Bar chart of the 20 most frequent terms (ggplot2 is already loaded above)
g <- ggplot(df, aes(word, freq))
g <- g + geom_bar(stat = "identity", fill = "white", colour = "green") + theme_bw()
g <- g + theme(axis.text.x = element_text(angle = 15, hjust = 1))
g
library(wordcloud)
set.seed(123)
# brewer.pal() requires n >= 3, so request 3 colors rather than 2
wordcloud(names(frequencies), frequencies, min.freq = 2, colors = brewer.pal(3, "Pastel2"))
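The frequent terms below are two-word phrases, so the term-document matrix was evidently rebuilt with a bigram tokenizer at this point; a minimal sketch of that step, assuming the same NGramTokenizer setup with min = 2, max = 2:
# Bigram tokenizer: same Weka call as above, but two-word terms
bigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm <- TermDocumentMatrix(corpora, control = list(tokenize = bigramTokenizer))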
findFreqTerms(tdm, lowfreq = 7)
## [1] "beep beep" "feel like" "i can" "i feel" "i just"
## [6] "i know" "i love" "i think" "i want" "right now"
findAssocs(tdm,"i love",corlimit=.2)
## $`i love`
## love i love somebody ★party party almost ten
## 0.46 0.46 0.23 0.23
## bad disney bday jordy behind always coming sr
## 0.23 0.23 0.23 0.23
## commercial mchips day hb desert view disney no
## 0.23 0.23 0.23 0.23
## drive almost east rim everyday week★ favor tweets
## 0.23 0.23 0.23 0.23
## gichy gichy gichy goo girl im goo not
## 0.23 0.23 0.23 0.23
## grandcanyon i happy bday hb rt hope know
## 0.23 0.23 0.23 0.23
## i east i morning i rly id rather
## 0.23 0.23 0.23 0.23
## im behind ive smiled jamming pandora jordy your
## 0.23 0.23 0.23 0.23
## keep life know love let keep lived hear
## 0.23 0.23 0.23 0.23
## loser i love â<U+0099> love commercial love disney
## 0.23 0.23 0.23 0.23
## love favor love id love jamming love life
## 0.23 0.23 0.23 0.23
## love loser love ma love momma love s
## 0.23 0.23 0.23 0.23
## love view love watching love yew ma youre
## 0.23 0.23 0.23 0.23
## make day make let mean i model i
## 0.23 0.23 0.23 0.23
## morning long no bad not mean now girl
## 0.23 0.23 0.23 0.23
## pandora ðÿ<U+009E> party everyday party party people i
## 0.23 0.23 0.23 0.23
## playrt make rather see rim drive rly think
## 0.23 0.23 0.23 0.23
## rt hoosiernation s nephew see lived smiled much
## 0.23 0.23 0.23 0.23
## sr yr ten years think ive view desert
## 0.23 0.23 0.23 0.23
## view grandcanyon watching playrt week★ i with good
## 0.23 0.23 0.23 0.23
## your role youre coming yr now
## 0.23 0.23 0.23
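The same sampling and cleaning steps were then applied to the blogs data. That code does not appear in the report, so the sketch below reconstructs it under the assumption that the pipeline and sample size match the Twitter pass (testBlogs, corporaBlogs, and tdmBlogs are names introduced here):
# Same pipeline, applied to a sample of the blogs data
set.seed(123)
testBlogs <- sample(dataBlogs, 1000)  # sample size assumed; not shown in the report
corporaBlogs <- Corpus(VectorSource(testBlogs))
corporaBlogs <- tm_map(corporaBlogs, removeWords, stopwords("english"))
corporaBlogs <- tm_map(corporaBlogs, FUN = tm_reduce, tmFuns = funs)
corporaBlogs <- tm_map(corporaBlogs, content_transformer(function(x) gsub("[$&^<>/@#]", " ", x)))
tdmBlogs <- TermDocumentMatrix(corporaBlogs, control = list(tokenize = unigramTokenizer))
corporaBlogs[[1]][1]  # first cleaned blog entry, shown below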
## $content
## [1] "to sum unjustly wounded men let us overlook wickedness worsen pain sharpen minds revenge remember mount god learn believe certain whatever enemy wickedly committed us permitted sent godâ<U+0080><U+0099>s just dispensation calvin â<U+0080><U+0093> institutes "
## the will one just can time like people know now
## 212 159 135 125 108 100 96 83 72 72
## see get really and think way also day want back
## 69 65 62 61 61 61 60 60 59 56
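And the same pass over the news data (dataNews is a data frame, so its first column is sampled; testNews, corporaNews, and tdmNews are again names introduced here):
# Same pipeline, applied to a sample of the news data
set.seed(123)
testNews <- sample(dataNews[, 1], 1000)  # sample size assumed; not shown in the report
corporaNews <- Corpus(VectorSource(testNews))
corporaNews <- tm_map(corporaNews, removeWords, stopwords("english"))
corporaNews <- tm_map(corporaNews, FUN = tm_reduce, tmFuns = funs)
corporaNews <- tm_map(corporaNews, content_transformer(function(x) gsub("[$&^<>/@#]", " ", x)))
tdmNews <- TermDocumentMatrix(corporaNews, control = list(tokenize = unigramTokenizer))
corporaNews[[1]][1]  # first cleaned news entry, shown below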
## $content
## [1] "riveras works beatnik studios show bold color expressionistic handling full references precolumbian cultures work depicts male figure subtitled nahuatl aztec word yaotl nahuatl warrior depicts fearsome visage"
## the said will one new also can but two years
## 524 502 228 155 142 141 133 125 115 115
## first just time year last like — city people state
## 113 112 106 100 97 92 87 87 84 83