Text Cleaning
library(tm)
Build Corpus
# Wrap the text column of each tweet data set in a tm corpus (the source is
# a character vector) and fold every document to lower case in the same step.
myCorpus <- tm_map(
  x = Corpus(VectorSource(tweets$text)),
  FUN = content_transformer(FUN = tolower)
)
myCorpus1 <- tm_map(
  x = Corpus(VectorSource(tweet1$text)),
  FUN = content_transformer(FUN = tolower)
)
# Delete every token that starts with "http": the match runs from "http" up
# to (but not including) the next whitespace character.
# x: character vector; returns a character vector of the same length.
removeURL <- function(x) {
  gsub(pattern = "http[^[:space:]]*", replacement = "", x = x)
}
# Strip URLs from the content of every document in both corpora.
myCorpus <- tm_map(x = myCorpus, FUN = content_transformer(FUN = removeURL))
myCorpus1 <- tm_map(x = myCorpus1, FUN = content_transformer(FUN = removeURL))
# Drop every character that is neither alphabetic ([:alpha:]) nor whitespace
# ([:space:]); digits and punctuation are removed, letters and spacing stay.
# x: character vector; returns a character vector of the same length.
removeNumPunct <- function(x) {
  gsub(pattern = "[^[:alpha:][:space:]]*", replacement = "", x = x)
}
# Remove digits and punctuation from the content of every document in both
# corpora.
myCorpus <- tm_map(
  x = myCorpus,
  FUN = content_transformer(FUN = removeNumPunct)
)
myCorpus1 <- tm_map(
  x = myCorpus1,
  FUN = content_transformer(FUN = removeNumPunct)
)
# remove stopwords
# Start from tm's English list, keep "r" and "big" as real terms, and add a
# few corpus-specific filler words.
myStopwords <- c(
  setdiff(stopwords("english"), c("r", "big")),
  "use", "kak", "kakak", "yuk", "ya"
)
# Extra stopwords come from the first column of stopwords-id.txt.
stopwords_id <- read.table('stopwords-id.txt', header = FALSE)
myStopwords <- c(myStopwords, as.character(stopwords_id$V1), "hi", "yg")
# Punctuation has already been stripped from the corpus, so a stopword such
# as "don't" now appears in the text as "dont" and would never be matched.
# Add the punctuation-free form of every stopword, produced with the same
# normaliser that was applied to the text. Trade-off: forms like "well"
# (from "we'll") are removed too; at this stage they can no longer be told
# apart from the contraction.
myStopwords <- unique(c(myStopwords, removeNumPunct(myStopwords)))
# Discard entries that became empty after normalisation.
myStopwords <- myStopwords[nzchar(myStopwords)]
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
myCorpus1 <- tm_map(myCorpus1, removeWords, myStopwords)
# Collapse the extra whitespace left behind by the earlier removals, then
# snapshot each cleaned corpus so it is available for stem completion later.
myCorpus <- tm_map(x = myCorpus, FUN = stripWhitespace)
myCorpusCopy <- myCorpus

myCorpus1 <- tm_map(x = myCorpus1, FUN = stripWhitespace)
myCorpusCopy1 <- myCorpus1
Frequent Words
Build Term Document Matrix
# Term-document matrices for both corpora. wordLengths = c(1, Inf) sets the
# minimum term length to 1, so single-character terms are kept as well.
tdm <- TermDocumentMatrix(
  x = myCorpus,
  control = list(wordLengths = c(1, Inf))
)
tdm1 <- TermDocumentMatrix(
  x = myCorpus1,
  control = list(wordLengths = c(1, Inf))
)
# Show the summary of the first matrix.
print(tdm)
## <<TermDocumentMatrix (terms: 4715, documents: 654)>>
## Non-/sparse entries: 11190/3072420
## Sparsity : 100%
## Maximal term length: 31
## Weighting : term frequency (tf)
# Show the summary of the second matrix.
print(tdm1)
## <<TermDocumentMatrix (terms: 5617, documents: 631)>>
## Non-/sparse entries: 14367/3529960
## Sparsity : 100%
## Maximal term length: 44
## Weighting : term frequency (tf)
Top Frequent Terms
# Terms whose frequency is at least 20, one vector per corpus.
freq.terms <- findFreqTerms(tdm, lowfreq = 20)
freq.terms1 <- findFreqTerms(tdm1, lowfreq = 20)
# Show at most the first 50 terms. head() returns fewer elements when fewer
# than 50 terms qualify, whereas [1:50] would pad the result with NA.
head(freq.terms, 50)
## [1] "president" "trumps" "macron"
## [4] "trump" "de" "realdonaldtrump"
## [7] "doesnt" "much" "people"
## [10] "rain" "world" "votes"
## [13] "florida" "fraud" "election"
## [16] "like" "dont" "will"
## [19] "en" "la" "que"
## [22] "get" "going" "hes"
## [25] "just" "amp" "one"
## [28] "democrats" "house" "go"
## [31] "russia" "now" "even"
## [34] "putin" "military" "paris"
## [37] "us" "day" "im"
## [40] "didnt" "still" "Ã"
## [43] "american" "america" "cant"
## [46] "donald" "le" "can"
## [49] "never" "vote"
# Show at most the first 50 terms of the second corpus. head() does not pad
# with NA when fewer than 50 terms are available, unlike [1:50].
head(freq.terms1, 50)
## [1] "akparti" "bu"
## [3] "da" "de"
## [5] "eyt" "kadar"
## [7] "rterdogan" "sayn"
## [9] "Ã" "mhpbilgi"
## [11] "n" "numankurtulmus"
## [13] "sÃ" "yok"
## [15] "atatÃ" "bir"
## [17] "daha" "dÃ"
## [19] "en" "erdoan"
## [21] "geÃ" "gÃ"
## [23] "in" "iÃ"
## [25] "ne" "nÃ"
## [27] "ve" "dbdevletbahceli"
## [29] "ikalin" "tcbestepe"
## [31] "zehrazumruts" "biz"
## [33] "e" "emeklilikteyaataklanlar"
## [35] "k" "olarak"
## [37] "rÃ" "yÃ"
## [39] "beoydeildir" "bÃ"
## [41] "eytÃ" "mÃ"
## [43] "ok" "olan"
## [45] "var" "abdulhamitgul"
## [47] "istiyoruz" "sn"
## [49] "artk" "bekliyoruz"
# Total count of every term: row sums of the dense term-by-document matrix.
term.freq <- rowSums(as.matrix(tdm))
term.freq1 <- rowSums(as.matrix(tdm1))
# Keep only the frequent terms; the cut-off differs per corpus (50 vs 30).
term.freq <- term.freq[term.freq >= 50]
term.freq1 <- term.freq1[term.freq1 >= 30]
# One row per retained term, ready for plotting.
df <- data.frame(term = names(term.freq), freq = term.freq)
df1 <- data.frame(term = names(term.freq1), freq = term.freq1)

# NOTE: the former par(mfrow = c(1, 2)) call was dropped. par() only governs
# base graphics; ggplot2 draws with grid, so the call never placed these two
# charts side by side and merely altered (or opened) the base device.
# Horizontal bar chart of term counts for the first corpus.
ggplot(df, aes(x = term, y = freq)) +
  geom_bar(stat = "identity") +
  xlab("Terms") +
  ylab("Count") +
  coord_flip() +
  theme(axis.text = element_text(size = 7))

# Same chart for the second corpus.
ggplot(df1, aes(x = term, y = freq)) +
  geom_bar(stat = "identity") +
  xlab("Terms") +
  ylab("Count") +
  coord_flip() +
  theme(axis.text = element_text(size = 7))

Wordcloud
Build Wordcloud
library(wordcloud)
# Dense term-by-document count matrices for both corpora.
m <- as.matrix(tdm)
m1 <- as.matrix(tdm1)
# calculate the frequency of words and sort it by frequency
# (TRUE is spelled out: T is an ordinary variable that can be reassigned)
word.freq <- sort(rowSums(m), decreasing = TRUE)
word.freq1 <- sort(rowSums(m1), decreasing = TRUE)
# colors: drop the first five of the nine "BuGn" colours, keeping the last four
pal <- brewer.pal(9, "BuGn")[-(1:5)]
# Draw the two clouds side by side. par() returns the previous settings, which
# are restored afterwards so later base plots are not forced into this layout.
old.par <- par(mfrow = c(1, 2))
# Only terms occurring at least 20 times are drawn; random.order = FALSE
# plots words in decreasing frequency.
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 20,
          random.order = FALSE, colors = pal)
wordcloud(words = names(word.freq1), freq = word.freq1, min.freq = 20,
          random.order = FALSE, colors = pal)
par(old.par)
