This short document is about John Green’s novel, The Fault in Our Stars. Is it about other faults in the stars, the ones that seem to look down at the distant ground of the earth we live on and yet can’t see what is there?
No, this is a word cloud built from the text of the story written by John Green.
# NOTE(review): machine-specific working directory. Kept so that relative paths
# resolve exactly as before; it has to be adjusted on any other machine.
setwd("E:/Other Business/DS_Projects/TFIOS_WordCloud/")
# Read the novel text as a tab-separated table.
# quote = "" and comment.char = "" turn off read.table()'s default special
# treatment of ' " and #, which would otherwise merge lines across an unmatched
# quote character or cut a line short at a '#'.
tfios.text <- read.table("fault-in-the-stars.txt", sep = "\t",
                         quote = "", comment.char = "")
library(tm)
## Warning: package 'tm' was built under R version 3.1.3
## Loading required package: NLP
library(SnowballC)
## Warning: package 'SnowballC' was built under R version 3.1.2
# Build the tm corpus directly from the table read above.
# NOTE(review): tfios.text is passed as a whole data frame rather than as its
# text column - confirm VectorSource() splits it into documents as intended.
corpus <- Corpus(x = VectorSource(x = tfios.text))
#corpus[[1]]
# Print a per-column summary of the raw input table.
summary(object = tfios.text)
## V1
## : 199
## An Imperial Affliction: 31
## .” : 20
## * * * : 19
## . : 18
## The Price of Dawn : 9
## (Other) :4540
Convert all text to lower case
# Lower-case every document.
# tolower() is a plain character function, not a tm transformation, so it is
# wrapped in content_transformer(): tm_map() then receives text documents back
# instead of bare character data. That also removes the need for the separate
# PlainTextDocument repair step that used to follow this call.
corpus <- tm_map(corpus, content_transformer(tolower))
#corpus[[1]]
Remove punctuation
# Strip punctuation characters from every document in the corpus.
corpus <- tm_map(x = corpus, FUN = removePunctuation)
#corpus[[1]]
Remove stop words, along with the word “said”, which is the most common word.
# Drop English stop words plus "said" (described in the text as the most
# common word).
# Punctuation has already been stripped from the corpus by this point, so a
# contraction such as "don't" now appears as "dont" and would no longer match
# the stock stop-word list. The list is therefore extended with
# punctuation-free variants of the same words.
corpus <- tm_map(
  corpus,
  removeWords,
  unique(c(
    stopwords("english"),
    removePunctuation(stopwords("english")),
    "said"
  ))
)
#corpus[[1]]
Convert each document to a PlainTextDocument
# Re-wrap each document in the corpus as a PlainTextDocument.
corpus <- tm_map(x = corpus, FUN = PlainTextDocument)
#corpus[[1]]
Quick Fix for Corpus
# Rebuild the corpus by feeding the existing corpus back in as a vector source
# (the accompanying text labels this a "quick fix").
# NOTE(review): presumably a workaround for document-class problems left by the
# earlier tm_map() steps - confirm it is still required with the installed tm
# version before relying on or removing it.
corpus <- Corpus(VectorSource(corpus))
#corpus[[1]]
Get the term frequencies. (The intent was to keep only terms with a frequency of at least 2000, but the code below does not apply that threshold.)
# Tabulate how often each term occurs in each document of the corpus.
frequencies <- DocumentTermMatrix(x = corpus)
#frequencies
Convert into data frame
# Expand the document-term matrix to a dense matrix, then to a data frame.
df.text <- as.data.frame(x = as.matrix(x = frequencies))
#head(df.text)
Convert the column names into syntactically valid R names
# Sanitise the term names so that each one is a syntactically valid R name.
names(df.text) <- make.names(names = names(df.text))
# Report how many distinct column names remain after sanitising.
length(unique(names(df.text)))
## [1] 7595
Creating wordcloud
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.1.3
## Loading required package: RColorBrewer
## Warning: package 'RColorBrewer' was built under R version 3.1.2
# Draw the word cloud: one entry per column of df.text, sized by that column's
# total count, placed in decreasing-frequency order with the Dark2 palette.
wordcloud(
  words = colnames(df.text),
  freq = colSums(df.text),
  scale = c(2, 0.25),
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)
## Warning in wordcloud(colnames(df.text), colSums(df.text), random.order =
## FALSE, : vondelpark could not be fit on page. It will not be plotted.