Length of files
# Twitter: load every line of the file and report how many there are.
con <- file(
  description = "/home/HK/Documents/Coursera/Data_science/Capstone/final/Analysis/check/en_US.twitter.txt",
  open = "r"
)
a <- readLines(con = con, skipNul = TRUE) # skipNul drops embedded NUL bytes
close(con = con)
length(x = a)
## [1] 2360148
Blogs
# Blogs: load every line of the file and report how many there are.
con <- file(
  description = "/home/HK/Documents/Coursera/Data_science/Capstone/final/Analysis/check/en_US.blogs.txt",
  open = "r"
)
a <- readLines(con = con, skipNul = TRUE) # skipNul drops embedded NUL bytes
close(con = con)
length(x = a)
## [1] 899288
News
# News: load every line of the file and report how many there are.
con <- file(
  description = "/home/HK/Documents/Coursera/Data_science/Capstone/final/Analysis/check/en_US.news.txt",
  open = "r"
)
a <- readLines(con = con, skipNul = TRUE) # skipNul drops embedded NUL bytes
close(con = con)
length(x = a)
## [1] 1010242
The main files, being so large, were split into smaller ones. The samples were then drawn at random from these smaller files.
con <- file("/home/HK/Documents/Coursera/Data_science/Capstone/final/Analysis/check/en_US.twitter.txt", "r") # replace with your path
a <- readLines(con, skipNul = TRUE) # reads the whole file, one element per line
close(con) # release the connection once the lines are in memory
# Split the lines into chunks of 10000, written to the current working
# directory as twitter_data_0.txt ... twitter_data_236.txt (237 files).
# The last file holds only the lines left over after the final full chunk;
# indexing is clamped to length(a) so no "NA" padding lines are written.
n_lines <- length(a)
twitter_index <- seq(from = 10000, to = 2360000, by = 10000)
# first chunk: lines 1..10000
tmp <- a[seq_len(min(twitter_index[1], n_lines))]
filename <- "twitter_data_0.txt"
write(tmp, file = filename)
# length_tmp keeps its leading 0 and then records the size of each chunk
# written by the loop.
length_tmp <- 0
for (i in seq_along(twitter_index)) {
  # chunk i starts right after the i-th boundary
  range_start <- twitter_index[i] + 1
  if (range_start > n_lines) {
    break # nothing left to write
  }
  range_end <- min(range_start + 9999, n_lines)
  tmp <- a[range_start:range_end]
  filename <- paste0("twitter_data_", i, ".txt")
  write(tmp, file = filename)
  length_tmp <- c(length_tmp, length(tmp))
  remove(tmp)
}
## For Blogs
con <- file("/home/HK/Documents/Coursera/Data_science/Capstone/final/Analysis/check/en_US.blogs.txt", "r") # replace with your path
a <- readLines(con, skipNul = TRUE) # reads the whole file, one element per line
close(con) # release the connection once the lines are in memory
# Split the lines into chunks of 8990, written as blog_data_0.txt ...
# blog_data_100.txt (101 files). Chunks do not overlap, and the last file
# holds only the leftover lines (indexing is clamped to length(a), so no
# "NA" padding lines are written).
n_lines <- length(a)
blog_index <- seq(from = 8990, to = 899000, by = 8990)
# Output files go to the blogs folder (this changes the working directory).
setwd("/home/HK/Documents/Coursera/Data_science/Capstone/final/Intermidiate files/blogs/")
# first chunk: lines 1..8990
tmp <- a[seq_len(min(blog_index[1], n_lines))]
filename <- "blog_data_0.txt"
write(tmp, file = filename)
# length_tmp keeps its leading 0 and then records the size of each chunk
# written by the loop.
length_tmp <- 0
for (i in seq_along(blog_index)) {
  # chunk i starts right after the i-th boundary
  range_start <- blog_index[i] + 1
  if (range_start > n_lines) {
    break # nothing left to write
  }
  # + 8989 (not + 8990) keeps each chunk at exactly 8990 lines, so the first
  # line of the next chunk is not duplicated here.
  range_end <- min(range_start + 8989, n_lines)
  tmp <- a[range_start:range_end]
  filename <- paste0("blog_data_", i, ".txt")
  write(tmp, file = filename)
  length_tmp <- c(length_tmp, length(tmp))
  remove(tmp)
}
## For News
con <- file("/home/HK/Documents/Coursera/Data_science/Capstone/final/Analysis/check/en_US.news.txt", "r") # replace with your path
a <- readLines(con, skipNul = TRUE) # reads the whole file, one element per line
close(con) # release the connection once the lines are in memory
# Split the lines into chunks of 10000, written as news_data_0.txt ...
# news_data_101.txt (102 files). The last file holds only the leftover
# lines; indexing is clamped to length(a) so no "NA" padding lines are
# written.
n_lines <- length(a)
news_index <- seq(from = 10000, to = 1010000, by = 10000)
# Output files go to the news folder (this changes the working directory).
setwd("/home/HK/Documents/Coursera/Data_science/Capstone/final/Intermidiate files/news/")
# first chunk: lines 1..10000
tmp <- a[seq_len(min(news_index[1], n_lines))]
filename <- "news_data_0.txt"
write(tmp, file = filename)
# length_tmp keeps its leading 0 and then records the size of each chunk
# written by the loop.
length_tmp <- 0
for (i in seq_along(news_index)) {
  # chunk i starts right after the i-th boundary
  range_start <- news_index[i] + 1
  if (range_start > n_lines) {
    break # nothing left to write
  }
  range_end <- min(range_start + 9999, n_lines)
  tmp <- a[range_start:range_end]
  filename <- paste0("news_data_", i, ".txt")
  write(tmp, file = filename)
  length_tmp <- c(length_tmp, length(tmp))
  remove(tmp)
}
How the files are sampled: one chunk file is drawn at random from each source.
# Fix the seed so the three draws below are repeatable, then pick one chunk
# number per source (the order of the draws matters for reproducibility).
set.seed(seed = 123)
twitter <- sample(x = 0:236, size = 1)
blog <- sample(x = 0:102, size = 1)
news <- sample(x = 0:102, size = 1)
# NOTE(review): blog_index has 100 boundaries and news_index has 101, so the
# splitting code numbers blog files 0..100 and news files 0..101. A draw of
# 101/102 (blog) or 102 (news) would name a file that was never written.
# Narrowing the ranges would change the seeded draws, so confirm before editing.
# Show the drawn numbers; the matching files are then copied by hand into a separate folder
print(twitter)
## [1] 68
print(blog)
## [1] 81
print(news)
## [1] 42
Loading Basic libraries which we are going to use later on.
library(RColorBrewer)
library(tm)
library(SnowballC)
library(ggplot2)
library(Rgraphviz) # Correlation plots.
library(magrittr)
Loading the sample data. The files sampled above were manually saved into a separate folder.
# Folder that holds the hand-picked sample files
cname <- file.path("/home/HK/Documents/Coursera/Data_science/Capstone/final/Intermidiate files/first_sample/")
# Show which files are in the folder, and how many
list.files(cname)
## [1] "blog_data_81.txt" "news_data_42.txt" "twitter_data_68.txt"
length(list.files(cname))
## [1] 3
# Read every file in the folder into a tm corpus (one document per file)
docs <- Corpus(
  DirSource(cname),
  readerControl = list(language = "en")
)
#inspect(docs)
Several transformations were applied to the data: replacing special characters with spaces, removing punctuation and numbers, stemming, removing stop words, stripping extra whitespace, and converting to lower case.
# Pre-processing pipeline for the corpus.
#
# Step order matters: removeWords matches whole lower-case words. The text is
# therefore lower-cased first, and stop words are removed while they are still
# intact (before punctuation stripping turns "don't" into "dont" and before
# stemming turns "because" into "becaus").
# NOTE: the `##` output recorded further down this file was produced with a
# different step order (stem -> removeWords -> tolower) and needs regenerating.

# Helper transformation: replace every match of `pattern` with a space
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
# English stop words plus a small list of words to censor
my_stopwords <- c(stopwords('english'), 'fuck', 'nigar', 'slut', 'slutty', 'sluty', 'shit')

docs <- tm_map(docs, toSpace, "_|)|:|;|!|=|#|/|@|\\|")
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeWords, my_stopwords)
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, stemDocument, language = "english")
# Second pass: catches inflected forms that stemming has just reduced to a
# listed word.
docs <- tm_map(docs, removeWords, my_stopwords)
docs <- tm_map(docs, stripWhitespace)
After all the transformations have been made, the data has to be converted into a document-term matrix. The following code does the trick.
# Build a document-term matrix from the cleaned corpus for further analysis;
# as.matrix() turns it into an ordinary matrix when one is needed.
dtm <- DocumentTermMatrix(x = docs)
print(dtm)
## <<DocumentTermMatrix (documents: 3, terms: 36469)>>
## Non-/sparse entries: 54046/55361
## Sparsity : 51%
## Maximal term length: 322
## Weighting : term frequency (tf)
The above shows some information about the matrix our data is stored in. Note that it is around 51% sparse (i.e. about half of its entries are empty).
Removing sparse terms.
# Drop the terms whose sparsity exceeds the 0.3 threshold, then show the
# reduced matrix.
dtm <- removeSparseTerms(x = dtm, sparse = 0.3)
print(dtm)
## <<DocumentTermMatrix (documents: 3, terms: 5681)>>
## Non-/sparse entries: 17043/0
## Sparsity : 0%
## Maximal term length: 13
## Weighting : term frequency (tf)
The sparsity has improved. This is a much more condensed form.
Some interesting findings out of the data - these functions can only be performed once the data is in a matrix format.
We can look at which words are the most abundant.
# Terms that occur at least 1400 times in total
findFreqTerms(x = dtm, lowfreq = 1400)
## [1] "can" "day" "get" "just" "like" "make" "new" "one" "said" "the"
## [11] "time" "will" "year"
We can also look at which other words in the dataset these words are correlated with.
# Terms whose correlation with "time" is at least 0.9
findAssocs(x = dtm, terms = "time", corlimit = 0.9)
## time
## abl 1.00
## abraham 1.00
## absolut 1.00
## accept 1.00
## accur 1.00
## advic 1.00
## alien 1.00
## aliv 1.00
## alreadi 1.00
## ancient 1.00
## and 1.00
## angel 1.00
## angl 1.00
## ani 1.00
## anoth 1.00
## anywher 1.00
## applic 1.00
## arab 1.00
## arch 1.00
## arm 1.00
## around 1.00
## attir 1.00
## australia 1.00
## avocado 1.00
## awar 1.00
## away 1.00
## bag 1.00
## bakeri 1.00
## bald 1.00
## bare 1.00
## basi 1.00
## beginn 1.00
## behavior 1.00
## belong 1.00
## besid 1.00
## best 1.00
## bin 1.00
## black 1.00
## blair 1.00
## blood 1.00
## blue 1.00
## boil 1.00
## bolt 1.00
## bomb 1.00
## bone 1.00
## bookstor 1.00
## bottl 1.00
## bought 1.00
## box 1.00
## breed 1.00
## breez 1.00
## bright 1.00
## brock 1.00
## brooklyn 1.00
## bundl 1.00
## cargo 1.00
## castl 1.00
## casual 1.00
## celebr 1.00
## cheaper 1.00
## childhood 1.00
## choic 1.00
## chop 1.00
## chose 1.00
## clean 1.00
## clip 1.00
## closer 1.00
## clue 1.00
## colonel 1.00
## comfort 1.00
## command 1.00
## commenc 1.00
## complain 1.00
## complianc 1.00
## composit 1.00
## condemn 1.00
## confess 1.00
## confus 1.00
## contin 1.00
## costum 1.00
## cough 1.00
## coupl 1.00
## cover 1.00
## cowork 1.00
## cramp 1.00
## cultur 1.00
## cup 1.00
## curat 1.00
## cycl 1.00
## dash 1.00
## dead 1.00
## decor 1.00
## delight 1.00
## depend 1.00
## detach 1.00
## dialogu 1.00
## difficulti 1.00
## digest 1.00
## disappoint 1.00
## disast 1.00
## discov 1.00
## disturb 1.00
## dot 1.00
## drag 1.00
## dri 1.00
## drinker 1.00
## due 1.00
## dutch 1.00
## easier 1.00
## eleven 1.00
## emot 1.00
## enorm 1.00
## enough 1.00
## essenti 1.00
## ethnic 1.00
## even 1.00
## everywher 1.00
## evolut 1.00
## exampl 1.00
## excel 1.00
## exercis 1.00
## experi 1.00
## explain 1.00
## extens 1.00
## fade 1.00
## far 1.00
## fear 1.00
## felt 1.00
## femal 1.00
## fenc 1.00
## fighter 1.00
## fill 1.00
## flex 1.00
## flicker 1.00
## form 1.00
## format 1.00
## forum 1.00
## freedom 1.00
## front 1.00
## full 1.00
## galaxi 1.00
## gentl 1.00
## german 1.00
## gone 1.00
## grandmoth 1.00
## guid 1.00
## hand 1.00
## happen 1.00
## haw 1.00
## head 1.00
## heat 1.00
## heroic 1.00
## heroin 1.00
## historian 1.00
## hooker 1.00
## horizon 1.00
## huge 1.00
## human 1.00
## ignor 1.00
## incompet 1.00
## incred 1.00
## influenti 1.00
## infrastructur 1.00
## inject 1.00
## intak 1.00
## interest 1.00
## internet 1.00
## interpret 1.00
## intuit 1.00
## iron 1.00
## italian 1.00
## item 1.00
## jean 1.00
## jona 1.00
## joshua 1.00
## kind 1.00
## knew 1.00
## kurt 1.00
## leav 1.00
## lesbian 1.00
## lightn 1.00
## llc 1.00
## load 1.00
## long 1.00
## longer 1.00
## lot 1.00
## louis 1.00
## loyalti 1.00
## make 1.00
## male 1.00
## manner 1.00
## marri 1.00
## marvel 1.00
## mass 1.00
## mat 1.00
## max 1.00
## mental 1.00
## mere 1.00
## messi 1.00
## metabol 1.00
## might 1.00
## mint 1.00
## mirror 1.00
## mislead 1.00
## moral 1.00
## more 1.00
## mortal 1.00
## mud 1.00
## music 1.00
## muslim 1.00
## naiv 1.00
## nake 1.00
## nat 1.00
## natur 1.00
## night 1.00
## nois 1.00
## none 1.00
## nose 1.00
## obes 1.00
## odd 1.00
## often 1.00
## once 1.00
## one 1.00
## onli 1.00
## onlin 1.00
## otherwis 1.00
## outlook 1.00
## oven 1.00
## overwhelm 1.00
## packag 1.00
## packet 1.00
## paint 1.00
## paperwork 1.00
## parallel 1.00
## parti 1.00
## peggi 1.00
## phrase 1.00
## piec 1.00
## pine 1.00
## plenti 1.00
## plot 1.00
## poland 1.00
## polar 1.00
## politician 1.00
## porch 1.00
## pregnanc 1.00
## pregnant 1.00
## preview 1.00
## prey 1.00
## princ 1.00
## principl 1.00
## prix 1.00
## prize 1.00
## protocol 1.00
## punch 1.00
## purpos 1.00
## quiz 1.00
## rant 1.00
## rape 1.00
## react 1.00
## reason 1.00
## reconstruct 1.00
## refere 1.00
## reflect 1.00
## refresh 1.00
## regim 1.00
## reinforc 1.00
## relationship 1.00
## remark 1.00
## remov 1.00
## resembl 1.00
## rest 1.00
## review 1.00
## revolut 1.00
## ride 1.00
## root 1.00
## row 1.00
## runaway 1.00
## sam 1.00
## sampl 1.00
## samuel 1.00
## satisfact 1.00
## scan 1.00
## search 1.00
## sens 1.00
## shape 1.00
## share 1.00
## she 1.00
## shed 1.00
## sheet 1.00
## shield 1.00
## shock 1.00
## short 1.00
## shorter 1.00
## shoulder 1.00
## shown 1.00
## shutdown 1.00
## silenc 1.00
## simplifi 1.00
## sink 1.00
## sister 1.00
## skill 1.00
## skirt 1.00
## slip 1.00
## soil 1.00
## somehow 1.00
## somewhat 1.00
## sonic 1.00
## soup 1.00
## specif 1.00
## spectacular 1.00
## spoil 1.00
## spous 1.00
## spray 1.00
## squeez 1.00
## stalk 1.00
## stanc 1.00
## steer 1.00
## sticker 1.00
## storag 1.00
## strang 1.00
## strawberri 1.00
## strip 1.00
## submiss 1.00
## sugar 1.00
## suitcas 1.00
## sweat 1.00
## sympathi 1.00
## tabl 1.00
## take 1.00
## talent 1.00
## technic 1.00
## teenag 1.00
## ten 1.00
## tend 1.00
## tension 1.00
## tent 1.00
## their 1.00
## theme 1.00
## themselv 1.00
## these 1.00
## thigh 1.00
## through 1.00
## tight 1.00
## toddler 1.00
## total 1.00
## trademark 1.00
## tragedi 1.00
## transform 1.00
## transmiss 1.00
## treasur 1.00
## trick 1.00
## twist 1.00
## unless 1.00
## unravel 1.00
## upper 1.00
## use 1.00
## usher 1.00
## voic 1.00
## volum 1.00
## wasnt 1.00
## way 1.00
## web 1.00
## wed 1.00
## weight 1.00
## wheat 1.00
## wherea 1.00
## wind 1.00
## wisdom 1.00
## wizard 1.00
## world 1.00
## worn 1.00
## worri 1.00
## wors 1.00
## worship 1.00
## wrap 1.00
## wrinkl 1.00
## written 1.00
## abil 0.99
## abov 0.99
## absurd 0.99
## achiev 0.99
## african 0.99
## alon 0.99
## annoy 0.99
## aris 0.99
## assassin 0.99
## aunt 0.99
## bake 0.99
## barbecu 0.99
## barrier 0.99
## basic 0.99
## belief 0.99
## beneath 0.99
## bite 0.99
## blade 0.99
## blew 0.99
## boot 0.99
## brand 0.99
## british 0.99
## broke 0.99
## broken 0.99
## brutal 0.99
## bulk 0.99
## bunni 0.99
## burst 0.99
## canadian 0.99
## charm 0.99
## chemic 0.99
## chest 0.99
## child 0.99
## christian 0.99
## coaster 0.99
## coat 0.99
## compil 0.99
## complet 0.99
## compos 0.99
## constant 0.99
## corn 0.99
## cotton 0.99
## cousin 0.99
## crack 0.99
## craft 0.99
## creat 0.99
## creation 0.99
## dark 0.99
## decid 0.99
## dedic 0.99
## differ 0.99
## dip 0.99
## doe 0.99
## doesnt 0.99
## door 0.99
## doubt 0.99
## dough 0.99
## downward 0.99
## each 0.99
## easili 0.99
## either 0.99
## email 0.99
## embrac 0.99
## emili 0.99
## emma 0.99
## empti 0.99
## enemi 0.99
## english 0.99
## enthusiast 0.99
## entri 0.99
## equival 0.99
## everi 0.99
## except 0.99
## experienc 0.99
## fact 0.99
## familiar 0.99
## father 0.99
## film 0.99
## fit 0.99
## flame 0.99
## fold 0.99
## formula 0.99
## frame 0.99
## from 0.99
## frustrat 0.99
## fuss 0.99
## garag 0.99
## genr 0.99
## give 0.99
## goe 0.99
## golden 0.99
## googl 0.99
## grave 0.99
## green 0.99
## grow 0.99
## guilt 0.99
## hard 0.99
## harvest 0.99
## haunt 0.99
## hole 0.99
## holiday 0.99
## horror 0.99
## hour 0.99
## household 0.99
## howev 0.99
## humbl 0.99
## humor 0.99
## husband 0.99
## icon 0.99
## individu 0.99
## inevit 0.99
## instruct 0.99
## interact 0.99
## introduc 0.99
## introduct 0.99
## isnt 0.99
## jacket 0.99
## jane 0.99
## japanes 0.99
## joke 0.99
## justifi 0.99
## keep 0.99
## knife 0.99
## laid 0.99
## lemon 0.99
## lie 0.99
## liquid 0.99
## literatur 0.99
## live 0.99
## log 0.99
## lone 0.99
## magic 0.99
## mail 0.99
## malcolm 0.99
## match 0.99
## meant 0.99
## medium 0.99
## method 0.99
## mexican 0.99
## mindset 0.99
## model 0.99
## most 0.99
## movement 0.99
## must 0.99
## neither 0.99
## noodl 0.99
## notic 0.99
## obvious 0.99
## occas 0.99
## onc 0.99
## order 0.99
## ordinari 0.99
## our 0.99
## path 0.99
## peg 0.99
## pepper 0.99
## perhap 0.99
## philosophi 0.99
## photograph 0.99
## piano 0.99
## place 0.99
## plastic 0.99
## playground 0.99
## portray 0.99
## precious 0.99
## pride 0.99
## proport 0.99
## pull 0.99
## pure 0.99
## put 0.99
## quiet 0.99
## rack 0.99
## rare 0.99
## regard 0.99
## religion 0.99
## resist 0.99
## resolut 0.99
## retreat 0.99
## reveal 0.99
## ridicul 0.99
## right 0.99
## salt 0.99
## satellit 0.99
## scrap 0.99
## scrape 0.99
## seen 0.99
## seller 0.99
## seren 0.99
## serious 0.99
## shrug 0.99
## silent 0.99
## slight 0.99
## snow 0.99
## societi 0.99
## solid 0.99
## sorrow 0.99
## sparkl 0.99
## speak 0.99
## spear 0.99
## spici 0.99
## spiral 0.99
## squash 0.99
## steam 0.99
## stone 0.99
## stripe 0.99
## studio 0.99
## stunt 0.99
## style 0.99
## subject 0.99
## succeed 0.99
## suppos 0.99
## swim 0.99
## tap 0.99
## tape 0.99
## tast 0.99
## tear 0.99
## techniqu 0.99
## then 0.99
## tini 0.99
## titan 0.99
## togeth 0.99
## tone 0.99
## tongu 0.99
## toy 0.99
## tshirt 0.99
## tube 0.99
## type 0.99
## uncomfort 0.99
## unfortun 0.99
## unknown 0.99
## until 0.99
## urgent 0.99
## vagu 0.99
## valuabl 0.99
## vanish 0.99
## vintag 0.99
## walk 0.99
## want 0.99
## weak 0.99
## window 0.99
## worst 0.99
## worth 0.99
## wouldnt 0.99
## yellow 0.99
## absent 0.98
## act 0.98
## adapt 0.98
## add 0.98
## admit 0.98
## adult 0.98
## africa 0.98
## ala 0.98
## album 0.98
## alicia 0.98
## all 0.98
## armstrong 0.98
## arrow 0.98
## back 0.98
## batteri 0.98
## becom 0.98
## begin 0.98
## believ 0.98
## bell 0.98
## bias 0.98
## bigger 0.98
## blond 0.98
## blow 0.98
## bow 0.98
## bread 0.98
## brother 0.98
## bump 0.98
## bust 0.98
## calendar 0.98
## capabl 0.98
## captain 0.98
## carv 0.98
## caviar 0.98
## cheap 0.98
## chicken 0.98
## chosen 0.98
## circul 0.98
## collater 0.98
## comment 0.98
## compel 0.98
## concept 0.98
## confidenti 0.98
## correct 0.98
## cottag 0.98
## cours 0.98
## creativ 0.98
## cricket 0.98
## crown 0.98
## curious 0.98
## dare 0.98
## defin 0.98
## descript 0.98
## destini 0.98
## digit 0.98
## discoveri 0.98
## drain 0.98
## drama 0.98
## draw 0.98
## ear 0.98
## easi 0.98
## erik 0.98
## evok 0.98
## experiment 0.98
## explod 0.98
## faith 0.98
## fall 0.98
## fate 0.98
## fault 0.98
## feed 0.98
## figur 0.98
## final 0.98
## find 0.98
## fireplac 0.98
## flash 0.98
## flesh 0.98
## flower 0.98
## food 0.98
## fragment 0.98
## fresh 0.98
## funki 0.98
## fuzzi 0.98
## garlic 0.98
## gem 0.98
## gospel 0.98
## gps 0.98
## grapefruit 0.98
## grate 0.98
## gratitud 0.98
## graviti 0.98
## groceri 0.98
## guin 0.98
## haiti 0.98
## healthi 0.98
## her 0.98
## hgtv 0.98
## hide 0.98
## ian 0.98
## illustr 0.98
## impress 0.98
## include 0.98
## incorpor 0.98
## india 0.98
## insignific 0.98
## instinct 0.98
## insult 0.98
## insurg 0.98
## intact 0.98
## invent 0.98
## invit 0.98
## juic 0.98
## julian 0.98
## jungl 0.98
## kati 0.98
## keyboard 0.98
## languag 0.98
## leaf 0.98
## lifestyl 0.98
## lipstick 0.98
## liu 0.98
## loos 0.98
## luke 0.98
## magnifi 0.98
## mainstream 0.98
## makeov 0.98
## mansion 0.98
## mask 0.98
## matur 0.98
## meaning 0.98
## microsoft 0.98
## miner 0.98
## miseri 0.98
## mix 0.98
## movi 0.98
## narrat 0.98
## narrow 0.98
## necessarili 0.98
## noisi 0.98
## opinion 0.98
## origin 0.98
## outdat 0.98
## overlook 0.98
## pair 0.98
## pasta 0.98
## peninsula 0.98
## peopl 0.98
## permiss 0.98
## pile 0.98
## polish 0.98
## pop 0.98
## preacher 0.98
## prefer 0.98
## preschool 0.98
## privileg 0.98
## progress 0.98
## pub 0.98
## queen 0.98
## radic 0.98
## rage 0.98
## republ 0.98
## resum 0.98
## revel 0.98
## revisit 0.98
## ring 0.98
## rocket 0.98
## salli 0.98
## salon 0.98
## saw 0.98
## scene 0.98
## scholar 0.98
## scream 0.98
## seem 0.98
## selfish 0.98
## shell 0.98
## shes 0.98
## shini 0.98
## shooter 0.98
## shop 0.98
## shower 0.98
## side 0.98
## simpl 0.98
## sip 0.98
## sit 0.98
## sky 0.98
## smoke 0.98
## soft 0.98
## solut 0.98
## spirit 0.98
## spit 0.98
## stain 0.98
## stapl 0.98
## start 0.98
## steward 0.98
## stick 0.98
## stori 0.98
## strand 0.98
## straw 0.98
## sudden 0.98
## sun 0.98
## sustain 0.98
## swing 0.98
## tale 0.98
## task 0.98
## tempt 0.98
## teresa 0.98
## there 0.98
## this 0.98
## thorough 0.98
## though 0.98
## toilet 0.98
## toler 0.98
## tomato 0.98
## ton 0.98
## tori 0.98
## tourist 0.98
## toward 0.98
## trace 0.98
## treat 0.98
## tree 0.98
## tri 0.98
## tuck 0.98
## turn 0.98
## twilight 0.98
## unaccept 0.98
## uncle 0.98
## understand 0.98
## underwear 0.98
## unfinish 0.98
## usualli 0.98
## veri 0.98
## vinyl 0.98
## visit 0.98
## vocabulari 0.98
## walnut 0.98
## warm 0.98
## warmer 0.98
## week 0.98
## well 0.98
## went 0.98
## wheel 0.98
## which 0.98
## whistl 0.98
## wikipedia 0.98
## woman 0.98
## wont 0.98
## yet 0.98
## advertis 0.97
## aisl 0.97
## ale 0.97
## almond 0.97
## although 0.97
## alway 0.97
## angri 0.97
## antiqu 0.97
## apolog 0.97
## appli 0.97
## becaus 0.97
## bottom 0.97
## brew 0.97
## butter 0.97
## buy 0.97
## calm 0.97
## can 0.97
## card 0.97
## carrot 0.97
## caus 0.97
## certain 0.97
## chain 0.97
## charact 0.97
## chariti 0.97
## chart 0.97
## choos 0.97
## cloth 0.97
## comparison 0.97
## content 0.97
## convers 0.97
## corrupt 0.97
## couldnt 0.97
## creami 0.97
## cross 0.97
## daili 0.97
## darker 0.97
## decent 0.97
## deep 0.97
## depress 0.97
## design 0.97
## desir 0.97
## die 0.97
## difficult 0.97
## disrupt 0.97
## distract 0.97
## dragon 0.97
## dust 0.97
## egg 0.97
## exclus 0.97
## fals 0.97
## fashion 0.97
## fiction 0.97
## flat 0.97
## fortun 0.97
## galleri 0.97
## gather 0.97
## generous 0.97
## genuin 0.97
## ghost 0.97
## given 0.97
## grandfath 0.97
## greater 0.97
## guitar 0.97
## hadnt 0.97
## hannah 0.97
## heavi 0.97
## idea 0.97
## innoc 0.97
## insert 0.97
## insight 0.97
## kid 0.97
## kindl 0.97
## label 0.97
## latter 0.97
## lay 0.97
## lean 0.97
## learn 0.97
## loom 0.97
## luxuri 0.97
## mani 0.97
## master 0.97
## memori 0.97
## moment 0.97
## moreov 0.97
## mother 0.97
## mouth 0.97
## mysteri 0.97
## name 0.97
## navig 0.97
## negat 0.97
## outlin 0.97
## pale 0.97
## parent 0.97
## pari 0.97
## percept 0.97
## philippin 0.97
## pirat 0.97
## proper 0.97
## psycholog 0.97
## quest 0.97
## quick 0.97
## quit 0.97
## ration 0.97
## romant 0.97
## rub 0.97
## rubber 0.97
## scientist 0.97
## scotland 0.97
## sea 0.97
## settl 0.97
## simpli 0.97
## sketch 0.97
## skin 0.97
## small 0.97
## smooth 0.97
## stir 0.97
## stress 0.97
## suggest 0.97
## taken 0.97
## teach 0.97
## theyd 0.97
## translat 0.97
## uniqu 0.97
## user 0.97
## usual 0.97
## vegetarian 0.97
## version 0.97
## view 0.97
## war 0.97
## wave 0.97
## when 0.97
## whimsic 0.97
## wore 0.97
## work 0.97
## acclaim 0.96
## aka 0.96
## albeit 0.96
## alec 0.96
## alice 0.96
## alli 0.96
## along 0.96
## ami 0.96
## amus 0.96
## anniversari 0.96
## answer 0.96
## aspect 0.96
## assert 0.96
## bedroom 0.96
## behalf 0.96
## belli 0.96
## better 0.96
## bound 0.96
## boy 0.96
## came 0.96
## chees 0.96
## chronic 0.96
## church 0.96
## citrus 0.96
## collar 0.96
## communic 0.96
## courtesi 0.96
## cow 0.96
## crafti 0.96
## crush 0.96
## daughter 0.96
## desert 0.96
## didnt 0.96
## direct 0.96
## distanc 0.96
## diva 0.96
## doll 0.96
## done 0.96
## drank 0.96
## dread 0.96
## drown 0.96
## edg 0.96
## entir 0.96
## especi 0.96
## exceed 0.96
## exclaim 0.96
## fanci 0.96
## fierc 0.96
## flick 0.96
## flow 0.96
## fog 0.96
## fond 0.96
## forgiv 0.96
## gabriel 0.96
## gate 0.96
## genius 0.96
## girlfriend 0.96
## grasp 0.96
## halfway 0.96
## hatch 0.96
## heal 0.96
## heel 0.96
## helen 0.96
## hollow 0.96
## hors 0.96
## hung 0.96
## ice 0.96
## imag 0.96
## imposs 0.96
## instant 0.96
## instead 0.96
## kingdom 0.96
## laptop 0.96
## lefti 0.96
## librari 0.96
## lisa 0.96
## littl 0.96
## livestock 0.96
## loyal 0.96
## luci 0.96
## lunch 0.96
## made 0.96
## margaret 0.96
## mediocr 0.96
## melt 0.96
## mileag 0.96
## nazi 0.96
## neat 0.96
## necessari 0.96
## old 0.96
## operat 0.96
## overlap 0.96
## pace 0.96
## pain 0.96
## pakistani 0.96
## palat 0.96
## patch 0.96
## person 0.96
## present 0.96
## priest 0.96
## probabl 0.96
## proclaim 0.96
## publish 0.96
## reassur 0.96
## reminisc 0.96
## remodel 0.96
## rich 0.96
## roof 0.96
## sake 0.96
## sauc 0.96
## sausag 0.96
## sentiment 0.96
## separ 0.96
## serial 0.96
## sinc 0.96
## size 0.96
## skull 0.96
## space 0.96
## spoken 0.96
## startl 0.96
## still 0.96
## stop 0.96
## store 0.96
## struggl 0.96
## sunset 0.96
## surpris 0.96
## tall 0.96
## templ 0.96
## thick 0.96
## tortur 0.96
## touch 0.96
## trader 0.96
## tragic 0.96
## undertak 0.96
## unhappi 0.96
## upcom 0.96
## updat 0.96
## veggi 0.96
## victoria 0.96
## violin 0.96
## visual 0.96
## wast 0.96
## water 0.96
## within 0.96
## abund 0.95
## actor 0.95
## almost 0.95
## altern 0.95
## array 0.95
## attitud 0.95
## autumn 0.95
## bark 0.95
## bodi 0.95
## bold 0.95
## boundari 0.95
## cake 0.95
## camera 0.95
## caramel 0.95
## carri 0.95
## ceas 0.95
## chase 0.95
## color 0.95
## core 0.95
## corner 0.95
## count 0.95
## demon 0.95
## detail 0.95
## dog 0.95
## ebook 0.95
## effect 0.95
## empire 0.95
## engag 0.95
## exist 0.95
## fluke 0.95
## for 0.95
## french 0.95
## fulli 0.95
## grill 0.95
## hamburg 0.95
## harder 0.95
## hold 0.95
## imagin 0.95
## impli 0.95
## import 0.95
## influenc 0.95
## intens 0.95
## islamic 0.95
## jew 0.95
## knowledg 0.95
## less 0.95
## lifetim 0.95
## lock 0.95
## main 0.95
## massiv 0.95
## meal 0.95
## mice 0.95
## minist 0.95
## minut 0.95
## mission 0.95
## mood 0.95
## network 0.95
## nowaday 0.95
## pack 0.95
## patienc 0.95
## pattern 0.95
## physic 0.95
## poor 0.95
## prepar 0.95
## print 0.95
## purchas 0.95
## recip 0.95
## relev 0.95
## respons 0.95
## rhythm 0.95
## roll 0.95
## rug 0.95
## sacr 0.95
## sacrific 0.95
## sand 0.95
## sat 0.95
## shade 0.95
## shake 0.95
## shi 0.95
## slim 0.95
## snap 0.95
## social 0.95
## sole 0.95
## specul 0.95
## stack 0.95
## steel 0.95
## strain 0.95
## subsequ 0.95
## surrend 0.95
## thin 0.95
## thread 0.95
## thus 0.95
## topic 0.95
## upon 0.95
## vampir 0.95
## wealth 0.95
## whole 0.95
## writer 0.95
## younger 0.95
## action 0.94
## actual 0.94
## adventur 0.94
## alberta 0.94
## ask 0.94
## beethoven 0.94
## befor 0.94
## birth 0.94
## bit 0.94
## blanket 0.94
## breach 0.94
## brick 0.94
## bug 0.94
## cater 0.94
## certif 0.94
## chang 0.94
## chilli 0.94
## chocol 0.94
## cinnamon 0.94
## come 0.94
## common 0.94
## conveni 0.94
## cooler 0.94
## counter 0.94
## dawn 0.94
## devot 0.94
## domain 0.94
## environ 0.94
## equal 0.94
## eventu 0.94
## everyth 0.94
## eye 0.94
## forth 0.94
## fundament 0.94
## germani 0.94
## hat 0.94
## immedi 0.94
## infus 0.94
## inner 0.94
## intellig 0.94
## invis 0.94
## israel 0.94
## itll 0.94
## jar 0.94
## journey 0.94
## kept 0.94
## lamb 0.94
## light 0.94
## literari 0.94
## marker 0.94
## mrs 0.94
## much 0.94
## natali 0.94
## need 0.94
## noah 0.94
## novel 0.94
## occasion 0.94
## palac 0.94
## peac 0.94
## perfect 0.94
## pin 0.94
## placement 0.94
## rather 0.94
## realiti 0.94
## reject 0.94
## russia 0.94
## sacrif 0.94
## satisfi 0.94
## scale 0.94
## scar 0.94
## scrub 0.94
## soak 0.94
## soap 0.94
## soda 0.94
## squar 0.94
## stall 0.94
## sticki 0.94
## stylist 0.94
## torment 0.94
## trip 0.94
## truth 0.94
## wander 0.94
## websit 0.94
## wise 0.94
## yell 0.94
## accident 0.93
## after 0.93
## approach 0.93
## approxim 0.93
## asylum 0.93
## background 0.93
## backward 0.93
## backyard 0.93
## bacon 0.93
## band 0.93
## bath 0.93
## below 0.93
## brain 0.93
## breweri 0.93
## brush 0.93
## children 0.93
## clock 0.93
## colleagu 0.93
## comput 0.93
## contain 0.93
## cookbook 0.93
## declar 0.93
## delic 0.93
## desper 0.93
## destroy 0.93
## disappear 0.93
## electron 0.93
## escap 0.93
## essay 0.93
## extra 0.93
## fail 0.93
## fair 0.93
## fifteen 0.93
## forgotten 0.93
## found 0.93
## franci 0.93
## friendship 0.93
## gender 0.93
## ginger 0.93
## grey 0.93
## half 0.93
## harm 0.93
## healthier 0.93
## hed 0.93
## hitler 0.93
## ident 0.93
## immediat 0.93
## informat 0.93
## itch 0.93
## jaw 0.93
## jimmi 0.93
## liber 0.93
## lift 0.93
## list 0.93
## magazin 0.93
## matter 0.93
## mechan 0.93
## medit 0.93
## merchandis 0.93
## musician 0.93
## normal 0.93
## paper 0.93
## particular 0.93
## peak 0.93
## pearl 0.93
## porter 0.93
## possibl 0.93
## potato 0.93
## process 0.93
## profess 0.93
## protein 0.93
## reader 0.93
## realiz 0.93
## rebecca 0.93
## reli 0.93
## remot 0.93
## room 0.93
## sang 0.93
## scienc 0.93
## sew 0.93
## shadow 0.93
## shame 0.93
## similar 0.93
## simul 0.93
## skate 0.93
## slept 0.93
## slow 0.93
## soundtrack 0.93
## sour 0.93
## sprinkl 0.93
## stamp 0.93
## stranger 0.93
## strength 0.93
## string 0.93
## symbol 0.93
## taught 0.93
## testament 0.93
## think 0.93
## thrill 0.93
## transgend 0.93
## wear 0.93
## zero 0.93
## alex 0.92
## anxieti 0.92
## artist 0.92
## barrel 0.92
## batch 0.92
## bead 0.92
## belov 0.92
## betray 0.92
## blur 0.92
## characterist 0.92
## circl 0.92
## clear 0.92
## closet 0.92
## consist 0.92
## copi 0.92
## daniel 0.92
## doctor 0.92
## drum 0.92
## ego 0.92
## fast 0.92
## floor 0.92
## foreign 0.92
## gotten 0.92
## grace 0.92
## grass 0.92
## handsom 0.92
## havent 0.92
## inspir 0.92
## joy 0.92
## leg 0.92
## letter 0.92
## line 0.92
## logic 0.92
## london 0.92
## look 0.92
## luggag 0.92
## mash 0.92
## mean 0.92
## mini 0.92
## modern 0.92
## needl 0.92
## nowher 0.92
## nutrit 0.92
## older 0.92
## onto 0.92
## pocket 0.92
## pressur 0.92
## prophet 0.92
## question 0.92
## realli 0.92
## refer 0.92
## rid 0.92
## scoop 0.92
## sequel 0.92
## set 0.92
## someth 0.92
## spare 0.92
## spinach 0.92
## spread 0.92
## subsid 0.92
## surviv 0.92
## tea 0.92
## tender 0.92
## thought 0.92
## towel 0.92
## trim 0.92
## trust 0.92
## underground 0.92
## virtual 0.92
## academ 0.91
## adopt 0.91
## age 0.91
## amount 0.91
## anyth 0.91
## automat 0.91
## avoid 0.91
## bathroom 0.91
## blame 0.91
## broad 0.91
## canada 0.91
## care 0.91
## collect 0.91
## compound 0.91
## conscious 0.91
## curs 0.91
## diet 0.91
## drawn 0.91
## eas 0.91
## embarrass 0.91
## encourag 0.91
## end 0.91
## enhanc 0.91
## futur 0.91
## gain 0.91
## garden 0.91
## glori 0.91
## grandpa 0.91
## greek 0.91
## heart 0.91
## henri 0.91
## here 0.91
## hunt 0.91
## ireland 0.91
## kitchen 0.91
## laugh 0.91
## lime 0.91
## mention 0.91
## middl 0.91
## monster 0.91
## needless 0.91
## newli 0.91
## note 0.91
## object 0.91
## phase 0.91
## photo 0.91
## pleasant 0.91
## poison 0.91
## popular 0.91
## pour 0.91
## powder 0.91
## pretend 0.91
## roast 0.91
## roman 0.91
## sensit 0.91
## sent 0.91
## shoe 0.91
## smile 0.91
## sort 0.91
## soviet 0.91
## spent 0.91
## thai 0.91
## toe 0.91
## unexpect 0.91
## valid 0.91
## whatev 0.91
## wide 0.91
## wild 0.91
## youd 0.91
## affair 0.90
## burn 0.90
## challeng 0.90
## cook 0.90
## couch 0.90
## dinner 0.90
## discuss 0.90
## england 0.90
## fascin 0.90
## glass 0.90
## grief 0.90
## habit 0.90
## lectur 0.90
## man 0.90
## nod 0.90
## panic 0.90
## precis 0.90
## sheer 0.90
## snack 0.90
## stay 0.90
## step 0.90
## stitch 0.90
## subtl 0.90
## target 0.90
## thing 0.90
## viewer 0.90
## whisper 0.90
## youv 0.90
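This shows all the words whose correlation with the word 'time' is at least 0.9. In plain words, the counts of these words rise and fall together with the count of 'time' across the documents. Because the corpus contains only three documents, each correlation is computed from just three points, which is why so many words reach such high values.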
# Plot correlations between the first 12 of the frequent terms
# (total count >= 1400), using a correlation threshold of 0.7.
frequent_terms <- findFreqTerms(dtm, lowfreq = 1400)
plot(dtm, terms = frequent_terms[1:12], corThreshold = 0.7)
We can also look at the term frequencies for the whole dataset, without specifying a lower limit.
# Total count of every term across all documents, largest first
term_totals <- colSums(as.matrix(dtm))
freq <- sort(term_totals, decreasing = TRUE)
head(freq)
## the said will one like just
## 4772 2862 2598 2469 2272 2167
# Same counts as a two-column table (word, freq), handy for plotting
wf <- data.frame(
  word = names(freq),
  freq = freq
)
head(wf)
## word freq
## the the 4772
## said said 2862
## will will 2598
## one one 2469
## like like 2272
## just just 2167
Plotting some frequencies
library(ggplot2)
# Bar chart of the words that occur more than 1000 times; the x-axis labels
# are rotated 45 degrees so they do not overlap.
frequent_words <- subset(wf, freq > 1000)
ggplot(frequent_words, aes(x = word, y = freq)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
You can also make fancy word clouds.
# Word cloud of the terms that occur at least 700 times
library(wordcloud)
set.seed(123) # fixed seed so the cloud can be reproduced
cloud_colours <- brewer.pal(6, "Dark2")
wordcloud(
  words = names(freq),
  freq = freq,
  min.freq = 700,
  colors = cloud_colours
)
Here we are making the word cloud for the words which appear at least 700 times in our sampled dataset.