Data Sampling

Length of files

Twitter

# Read the complete Twitter file (embedded NUL characters skipped) and report
# how many lines it contains.
con <- file(
  description = "/home/HK/Documents/Coursera/Data_science/Capstone/final/Analysis/check/en_US.twitter.txt",
  open = "r"
)
a <- readLines(con = con, skipNul = TRUE)
close(con)
length(a)
## [1] 2360148

Blogs

# Read the complete blogs file (embedded NUL characters skipped) and report
# how many lines it contains.
con <- file(
  description = "/home/HK/Documents/Coursera/Data_science/Capstone/final/Analysis/check/en_US.blogs.txt",
  open = "r"
)
a <- readLines(con = con, skipNul = TRUE)
close(con)
length(a)
## [1] 899288

News

# Read the complete news file (embedded NUL characters skipped) and report
# how many lines it contains.
con <- file(
  description = "/home/HK/Documents/Coursera/Data_science/Capstone/final/Analysis/check/en_US.news.txt",
  open = "r"
)
a <- readLines(con = con, skipNul = TRUE)
close(con)
length(a)
## [1] 1010242

The main files, being so large, were split into smaller ones. The samples were then drawn at random from these smaller files.

# Split the Twitter corpus into consecutive files of `chunk_size` lines each
# (twitter_data_0.txt, twitter_data_1.txt, ...), written to the current working
# directory. The final file receives whatever lines remain, so it is shorter
# than the others instead of being padded.
con <- file("/home/HK/Documents/Coursera/Data_science/Capstone/final/Analysis/check/en_US.twitter.txt", "r") # replace with your path
a <- readLines(con, skipNul = TRUE) # reads every line of the file, skipping embedded NULs
close(con) # release the connection as soon as the data is in memory

chunk_size <- 10000L
n_lines <- length(a)
n_files <- as.integer(ceiling(n_lines / chunk_size))

# Line numbers at which every file except the last one ends
# (10000, 20000, ...); kept under its original name.
twitter_index <- seq_len(max(n_files - 1L, 0L)) * chunk_size

lines_per_file <- integer(n_files)
for (k in seq_len(n_files)) {
  file_no <- k - 1L # file names are numbered from 0
  range_start <- file_no * chunk_size + 1L
  # Clip to the end of the data: indexing past length(a) yields NA, which
  # write() would emit as literal "NA" lines in the last file.
  range_end <- min(k * chunk_size, n_lines)

  filename <- paste0("twitter_data_", file_no, ".txt")
  write(a[range_start:range_end], file = filename)
  lines_per_file[k] <- range_end - range_start + 1L
}

# Same layout as before: a leading 0 followed by the line counts of
# files 1..N (file 0 is not listed).
length_tmp <- c(0, lines_per_file[-1])


## For Blogs

# Split the blogs corpus into consecutive, non-overlapping files of
# `chunk_size` lines each (blog_data_0.txt, blog_data_1.txt, ...). The final
# file receives whatever lines remain, so it is shorter than the others
# instead of being padded.
con <- file("/home/HK/Documents/Coursera/Data_science/Capstone/final/Analysis/check/en_US.blogs.txt", "r") # replace with your path
a <- readLines(con, skipNul = TRUE) # reads every line of the file, skipping embedded NULs
close(con) # release the connection as soon as the data is in memory

chunk_size <- 8990L
n_lines <- length(a)
n_files <- as.integer(ceiling(n_lines / chunk_size))

# Line numbers at which every file except the last one ends
# (8990, 17980, ...); kept under its original name.
blog_index <- seq_len(max(n_files - 1L, 0L)) * chunk_size

# The chunk files are written relative to this directory.
setwd("/home/HK/Documents/Coursera/Data_science/Capstone/final/Intermidiate files/blogs/")

lines_per_file <- integer(n_files)
for (k in seq_len(n_files)) {
  file_no <- k - 1L # file names are numbered from 0
  range_start <- file_no * chunk_size + 1L
  # Each chunk ends exactly chunk_size - 1 lines after its start, so that
  # neighbouring files do not share a line, and is clipped to the end of the
  # data so that no NA entries are written as literal "NA" lines.
  range_end <- min(k * chunk_size, n_lines)

  filename <- paste0("blog_data_", file_no, ".txt")
  write(a[range_start:range_end], file = filename)
  lines_per_file[k] <- range_end - range_start + 1L
}

# Same layout as before: a leading 0 followed by the line counts of
# files 1..N (file 0 is not listed).
length_tmp <- c(0, lines_per_file[-1])



## For News

# Split the news corpus into consecutive files of `chunk_size` lines each
# (news_data_0.txt, news_data_1.txt, ...). The final file receives whatever
# lines remain, so it is shorter than the others instead of being padded.
con <- file("/home/HK/Documents/Coursera/Data_science/Capstone/final/Analysis/check/en_US.news.txt", "r") # replace with your path
a <- readLines(con, skipNul = TRUE) # reads every line of the file, skipping embedded NULs
close(con) # release the connection as soon as the data is in memory

chunk_size <- 10000L
n_lines <- length(a)
n_files <- as.integer(ceiling(n_lines / chunk_size))

# Line numbers at which every file except the last one ends
# (10000, 20000, ...); kept under its original name.
news_index <- seq_len(max(n_files - 1L, 0L)) * chunk_size

# The chunk files are written relative to this directory.
setwd("/home/HK/Documents/Coursera/Data_science/Capstone/final/Intermidiate files/news/")

lines_per_file <- integer(n_files)
for (k in seq_len(n_files)) {
  file_no <- k - 1L # file names are numbered from 0
  range_start <- file_no * chunk_size + 1L
  # Clip to the end of the data: indexing past length(a) yields NA, which
  # write() would emit as literal "NA" lines in the last file.
  range_end <- min(k * chunk_size, n_lines)

  filename <- paste0("news_data_", file_no, ".txt")
  write(a[range_start:range_end], file = filename)
  lines_per_file[k] <- range_end - range_start + 1L
}

# Same layout as before: a leading 0 followed by the line counts of
# files 1..N (file 0 is not listed).
length_tmp <- c(0, lines_per_file[-1])

How the files are sampled: one of the smaller files is picked at random for each source.

# Fix the RNG seed, then draw one chunk-file number per source: a single value
# from 0..236 for Twitter and from 0..102 for blogs and for news.
# NOTE(review): blog_index has 100 boundaries and news_index has 101, so the
# files written are numbered 0..100 and 0..101; 0:102 can therefore return a
# number for which no file exists - confirm the intended upper bounds.
# NOTE(review): the values printed below match R's pre-3.6.0 default sampler;
# newer R versions may draw different numbers for the same seed - confirm
# before re-running.
set.seed(123)
twitter <- sample(0:236, 1)
blog <- sample(0:102, 1)
news<- sample(0:102, 1)

# Print and then manually store these files in a separate folder
twitter
## [1] 68
blog
## [1] 81
news
## [1] 42

Loading Basic libraries which we are going to use later on.

library(RColorBrewer) 
library(tm)
library(SnowballC)
library(ggplot2)
library(Rgraphviz) # Correlation plots.
library(magrittr)

Loading Data

Loading the sample data. The files sampled above were manually saved into a separate folder.

# Folder that holds the three sampled documents used from here on
cname <- "/home/HK/Documents/Coursera/Data_science/Capstone/final/Intermidiate files/first_sample/"
list.files(cname)
## [1] "blog_data_81.txt"    "news_data_42.txt"    "twitter_data_68.txt"
length(list.files(cname))
## [1] 3
# Read every file in that folder into a tm corpus for the analysis below
docs <- Corpus(
  DirSource(cname),
  readerControl = list(language = "en")
)
# inspect(docs)

Several transformations were applied to the data, including removePunctuation, removeNumbers, stemDocument, removeWords (stop words and profanity), stripWhitespace and conversion to lower case.

# Text clean-up pipeline applied to every document in `docs`.
# NOTE: the console output recorded in this report was produced with an
# earlier ordering of these steps; re-run the report to refresh it.

# Helper transformation: replace every match of `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# Lower-case first. The stop-word list is lower-case and removeWords() is
# case-sensitive, so capitalised stop words ("The", "And") would survive
# otherwise.
docs <- tm_map(docs, content_transformer(tolower))

# Remove stop words and profanity while the text is still unstemmed and still
# has its apostrophes, so that entries such as "because" or "doesn't" match.
my_stopwords <- c(stopwords("english"), "fuck", "nigar", "slut", "slutty", "sluty", "shit")
docs <- tm_map(docs, removeWords, my_stopwords)

# Turn separator-like symbols into spaces before punctuation is stripped, so
# the words on either side of them are not glued together. The character
# class matches the same single characters as the former alternation.
docs <- tm_map(docs, toSpace, "[_):;!=#/@|]")
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, stemDocument, language = "english")

# Second pass after stemming: inflected forms are now reduced to their stem
# and can match entries of the list (in particular the profanity terms).
docs <- tm_map(docs, removeWords, my_stopwords)
docs <- tm_map(docs, stripWhitespace)

After all the transformations have been made, the data has to be converted into a document-term matrix. The following code does the trick.

# Build the document-term matrix from the cleaned corpus; as.matrix(dtm)
# turns it into a plain matrix if one is needed.
dtm <- DocumentTermMatrix(x = docs)
print(dtm)
## <<DocumentTermMatrix (documents: 3, terms: 36469)>>
## Non-/sparse entries: 54046/55361
## Sparsity           : 51%
## Maximal term length: 322
## Weighting          : term frequency (tf)

The output above shows some information about the matrix our data is stored in. Note that it is around 51% sparse (empty).

Removing sparse terms.

# Drop the sparsest terms, using 0.3 as the maximal allowed sparsity, and
# show the summary of the reduced matrix.
dtm <- removeSparseTerms(x = dtm, sparse = 0.3)
print(dtm)
## <<DocumentTermMatrix (documents: 3, terms: 5681)>>
## Non-/sparse entries: 17043/0
## Sparsity           : 0%
## Maximal term length: 13
## Weighting          : term frequency (tf)

The sparsity has improved. This is a much more condensed form.

Exploratory Analysis

Some interesting findings out of the data - these functions can only be performed once the data is in a matrix format.

We can look at which words are the most abundant.

# List the terms that occur at least 1400 times
findFreqTerms(x = dtm, lowfreq = 1400)
##  [1] "can"  "day"  "get"  "just" "like" "make" "new"  "one"  "said" "the" 
## [11] "time" "will" "year"

We can also look at how strongly these words are correlated with other words in the dataset.

# Terms whose correlation with "time" is at least 0.9
findAssocs(x = dtm, terms = "time", corlimit = 0.9)
##               time
## abl           1.00
## abraham       1.00
## absolut       1.00
## accept        1.00
## accur         1.00
## advic         1.00
## alien         1.00
## aliv          1.00
## alreadi       1.00
## ancient       1.00
## and           1.00
## angel         1.00
## angl          1.00
## ani           1.00
## anoth         1.00
## anywher       1.00
## applic        1.00
## arab          1.00
## arch          1.00
## arm           1.00
## around        1.00
## attir         1.00
## australia     1.00
## avocado       1.00
## awar          1.00
## away          1.00
## bag           1.00
## bakeri        1.00
## bald          1.00
## bare          1.00
## basi          1.00
## beginn        1.00
## behavior      1.00
## belong        1.00
## besid         1.00
## best          1.00
## bin           1.00
## black         1.00
## blair         1.00
## blood         1.00
## blue          1.00
## boil          1.00
## bolt          1.00
## bomb          1.00
## bone          1.00
## bookstor      1.00
## bottl         1.00
## bought        1.00
## box           1.00
## breed         1.00
## breez         1.00
## bright        1.00
## brock         1.00
## brooklyn      1.00
## bundl         1.00
## cargo         1.00
## castl         1.00
## casual        1.00
## celebr        1.00
## cheaper       1.00
## childhood     1.00
## choic         1.00
## chop          1.00
## chose         1.00
## clean         1.00
## clip          1.00
## closer        1.00
## clue          1.00
## colonel       1.00
## comfort       1.00
## command       1.00
## commenc       1.00
## complain      1.00
## complianc     1.00
## composit      1.00
## condemn       1.00
## confess       1.00
## confus        1.00
## contin        1.00
## costum        1.00
## cough         1.00
## coupl         1.00
## cover         1.00
## cowork        1.00
## cramp         1.00
## cultur        1.00
## cup           1.00
## curat         1.00
## cycl          1.00
## dash          1.00
## dead          1.00
## decor         1.00
## delight       1.00
## depend        1.00
## detach        1.00
## dialogu       1.00
## difficulti    1.00
## digest        1.00
## disappoint    1.00
## disast        1.00
## discov        1.00
## disturb       1.00
## dot           1.00
## drag          1.00
## dri           1.00
## drinker       1.00
## due           1.00
## dutch         1.00
## easier        1.00
## eleven        1.00
## emot          1.00
## enorm         1.00
## enough        1.00
## essenti       1.00
## ethnic        1.00
## even          1.00
## everywher     1.00
## evolut        1.00
## exampl        1.00
## excel         1.00
## exercis       1.00
## experi        1.00
## explain       1.00
## extens        1.00
## fade          1.00
## far           1.00
## fear          1.00
## felt          1.00
## femal         1.00
## fenc          1.00
## fighter       1.00
## fill          1.00
## flex          1.00
## flicker       1.00
## form          1.00
## format        1.00
## forum         1.00
## freedom       1.00
## front         1.00
## full          1.00
## galaxi        1.00
## gentl         1.00
## german        1.00
## gone          1.00
## grandmoth     1.00
## guid          1.00
## hand          1.00
## happen        1.00
## haw           1.00
## head          1.00
## heat          1.00
## heroic        1.00
## heroin        1.00
## historian     1.00
## hooker        1.00
## horizon       1.00
## huge          1.00
## human         1.00
## ignor         1.00
## incompet      1.00
## incred        1.00
## influenti     1.00
## infrastructur 1.00
## inject        1.00
## intak         1.00
## interest      1.00
## internet      1.00
## interpret     1.00
## intuit        1.00
## iron          1.00
## italian       1.00
## item          1.00
## jean          1.00
## jona          1.00
## joshua        1.00
## kind          1.00
## knew          1.00
## kurt          1.00
## leav          1.00
## lesbian       1.00
## lightn        1.00
## llc           1.00
## load          1.00
## long          1.00
## longer        1.00
## lot           1.00
## louis         1.00
## loyalti       1.00
## make          1.00
## male          1.00
## manner        1.00
## marri         1.00
## marvel        1.00
## mass          1.00
## mat           1.00
## max           1.00
## mental        1.00
## mere          1.00
## messi         1.00
## metabol       1.00
## might         1.00
## mint          1.00
## mirror        1.00
## mislead       1.00
## moral         1.00
## more          1.00
## mortal        1.00
## mud           1.00
## music         1.00
## muslim        1.00
## naiv          1.00
## nake          1.00
## nat           1.00
## natur         1.00
## night         1.00
## nois          1.00
## none          1.00
## nose          1.00
## obes          1.00
## odd           1.00
## often         1.00
## once          1.00
## one           1.00
## onli          1.00
## onlin         1.00
## otherwis      1.00
## outlook       1.00
## oven          1.00
## overwhelm     1.00
## packag        1.00
## packet        1.00
## paint         1.00
## paperwork     1.00
## parallel      1.00
## parti         1.00
## peggi         1.00
## phrase        1.00
## piec          1.00
## pine          1.00
## plenti        1.00
## plot          1.00
## poland        1.00
## polar         1.00
## politician    1.00
## porch         1.00
## pregnanc      1.00
## pregnant      1.00
## preview       1.00
## prey          1.00
## princ         1.00
## principl      1.00
## prix          1.00
## prize         1.00
## protocol      1.00
## punch         1.00
## purpos        1.00
## quiz          1.00
## rant          1.00
## rape          1.00
## react         1.00
## reason        1.00
## reconstruct   1.00
## refere        1.00
## reflect       1.00
## refresh       1.00
## regim         1.00
## reinforc      1.00
## relationship  1.00
## remark        1.00
## remov         1.00
## resembl       1.00
## rest          1.00
## review        1.00
## revolut       1.00
## ride          1.00
## root          1.00
## row           1.00
## runaway       1.00
## sam           1.00
## sampl         1.00
## samuel        1.00
## satisfact     1.00
## scan          1.00
## search        1.00
## sens          1.00
## shape         1.00
## share         1.00
## she           1.00
## shed          1.00
## sheet         1.00
## shield        1.00
## shock         1.00
## short         1.00
## shorter       1.00
## shoulder      1.00
## shown         1.00
## shutdown      1.00
## silenc        1.00
## simplifi      1.00
## sink          1.00
## sister        1.00
## skill         1.00
## skirt         1.00
## slip          1.00
## soil          1.00
## somehow       1.00
## somewhat      1.00
## sonic         1.00
## soup          1.00
## specif        1.00
## spectacular   1.00
## spoil         1.00
## spous         1.00
## spray         1.00
## squeez        1.00
## stalk         1.00
## stanc         1.00
## steer         1.00
## sticker       1.00
## storag        1.00
## strang        1.00
## strawberri    1.00
## strip         1.00
## submiss       1.00
## sugar         1.00
## suitcas       1.00
## sweat         1.00
## sympathi      1.00
## tabl          1.00
## take          1.00
## talent        1.00
## technic       1.00
## teenag        1.00
## ten           1.00
## tend          1.00
## tension       1.00
## tent          1.00
## their         1.00
## theme         1.00
## themselv      1.00
## these         1.00
## thigh         1.00
## through       1.00
## tight         1.00
## toddler       1.00
## total         1.00
## trademark     1.00
## tragedi       1.00
## transform     1.00
## transmiss     1.00
## treasur       1.00
## trick         1.00
## twist         1.00
## unless        1.00
## unravel       1.00
## upper         1.00
## use           1.00
## usher         1.00
## voic          1.00
## volum         1.00
## wasnt         1.00
## way           1.00
## web           1.00
## wed           1.00
## weight        1.00
## wheat         1.00
## wherea        1.00
## wind          1.00
## wisdom        1.00
## wizard        1.00
## world         1.00
## worn          1.00
## worri         1.00
## wors          1.00
## worship       1.00
## wrap          1.00
## wrinkl        1.00
## written       1.00
## abil          0.99
## abov          0.99
## absurd        0.99
## achiev        0.99
## african       0.99
## alon          0.99
## annoy         0.99
## aris          0.99
## assassin      0.99
## aunt          0.99
## bake          0.99
## barbecu       0.99
## barrier       0.99
## basic         0.99
## belief        0.99
## beneath       0.99
## bite          0.99
## blade         0.99
## blew          0.99
## boot          0.99
## brand         0.99
## british       0.99
## broke         0.99
## broken        0.99
## brutal        0.99
## bulk          0.99
## bunni         0.99
## burst         0.99
## canadian      0.99
## charm         0.99
## chemic        0.99
## chest         0.99
## child         0.99
## christian     0.99
## coaster       0.99
## coat          0.99
## compil        0.99
## complet       0.99
## compos        0.99
## constant      0.99
## corn          0.99
## cotton        0.99
## cousin        0.99
## crack         0.99
## craft         0.99
## creat         0.99
## creation      0.99
## dark          0.99
## decid         0.99
## dedic         0.99
## differ        0.99
## dip           0.99
## doe           0.99
## doesnt        0.99
## door          0.99
## doubt         0.99
## dough         0.99
## downward      0.99
## each          0.99
## easili        0.99
## either        0.99
## email         0.99
## embrac        0.99
## emili         0.99
## emma          0.99
## empti         0.99
## enemi         0.99
## english       0.99
## enthusiast    0.99
## entri         0.99
## equival       0.99
## everi         0.99
## except        0.99
## experienc     0.99
## fact          0.99
## familiar      0.99
## father        0.99
## film          0.99
## fit           0.99
## flame         0.99
## fold          0.99
## formula       0.99
## frame         0.99
## from          0.99
## frustrat      0.99
## fuss          0.99
## garag         0.99
## genr          0.99
## give          0.99
## goe           0.99
## golden        0.99
## googl         0.99
## grave         0.99
## green         0.99
## grow          0.99
## guilt         0.99
## hard          0.99
## harvest       0.99
## haunt         0.99
## hole          0.99
## holiday       0.99
## horror        0.99
## hour          0.99
## household     0.99
## howev         0.99
## humbl         0.99
## humor         0.99
## husband       0.99
## icon          0.99
## individu      0.99
## inevit        0.99
## instruct      0.99
## interact      0.99
## introduc      0.99
## introduct     0.99
## isnt          0.99
## jacket        0.99
## jane          0.99
## japanes       0.99
## joke          0.99
## justifi       0.99
## keep          0.99
## knife         0.99
## laid          0.99
## lemon         0.99
## lie           0.99
## liquid        0.99
## literatur     0.99
## live          0.99
## log           0.99
## lone          0.99
## magic         0.99
## mail          0.99
## malcolm       0.99
## match         0.99
## meant         0.99
## medium        0.99
## method        0.99
## mexican       0.99
## mindset       0.99
## model         0.99
## most          0.99
## movement      0.99
## must          0.99
## neither       0.99
## noodl         0.99
## notic         0.99
## obvious       0.99
## occas         0.99
## onc           0.99
## order         0.99
## ordinari      0.99
## our           0.99
## path          0.99
## peg           0.99
## pepper        0.99
## perhap        0.99
## philosophi    0.99
## photograph    0.99
## piano         0.99
## place         0.99
## plastic       0.99
## playground    0.99
## portray       0.99
## precious      0.99
## pride         0.99
## proport       0.99
## pull          0.99
## pure          0.99
## put           0.99
## quiet         0.99
## rack          0.99
## rare          0.99
## regard        0.99
## religion      0.99
## resist        0.99
## resolut       0.99
## retreat       0.99
## reveal        0.99
## ridicul       0.99
## right         0.99
## salt          0.99
## satellit      0.99
## scrap         0.99
## scrape        0.99
## seen          0.99
## seller        0.99
## seren         0.99
## serious       0.99
## shrug         0.99
## silent        0.99
## slight        0.99
## snow          0.99
## societi       0.99
## solid         0.99
## sorrow        0.99
## sparkl        0.99
## speak         0.99
## spear         0.99
## spici         0.99
## spiral        0.99
## squash        0.99
## steam         0.99
## stone         0.99
## stripe        0.99
## studio        0.99
## stunt         0.99
## style         0.99
## subject       0.99
## succeed       0.99
## suppos        0.99
## swim          0.99
## tap           0.99
## tape          0.99
## tast          0.99
## tear          0.99
## techniqu      0.99
## then          0.99
## tini          0.99
## titan         0.99
## togeth        0.99
## tone          0.99
## tongu         0.99
## toy           0.99
## tshirt        0.99
## tube          0.99
## type          0.99
## uncomfort     0.99
## unfortun      0.99
## unknown       0.99
## until         0.99
## urgent        0.99
## vagu          0.99
## valuabl       0.99
## vanish        0.99
## vintag        0.99
## walk          0.99
## want          0.99
## weak          0.99
## window        0.99
## worst         0.99
## worth         0.99
## wouldnt       0.99
## yellow        0.99
## absent        0.98
## act           0.98
## adapt         0.98
## add           0.98
## admit         0.98
## adult         0.98
## africa        0.98
## ala           0.98
## album         0.98
## alicia        0.98
## all           0.98
## armstrong     0.98
## arrow         0.98
## back          0.98
## batteri       0.98
## becom         0.98
## begin         0.98
## believ        0.98
## bell          0.98
## bias          0.98
## bigger        0.98
## blond         0.98
## blow          0.98
## bow           0.98
## bread         0.98
## brother       0.98
## bump          0.98
## bust          0.98
## calendar      0.98
## capabl        0.98
## captain       0.98
## carv          0.98
## caviar        0.98
## cheap         0.98
## chicken       0.98
## chosen        0.98
## circul        0.98
## collater      0.98
## comment       0.98
## compel        0.98
## concept       0.98
## confidenti    0.98
## correct       0.98
## cottag        0.98
## cours         0.98
## creativ       0.98
## cricket       0.98
## crown         0.98
## curious       0.98
## dare          0.98
## defin         0.98
## descript      0.98
## destini       0.98
## digit         0.98
## discoveri     0.98
## drain         0.98
## drama         0.98
## draw          0.98
## ear           0.98
## easi          0.98
## erik          0.98
## evok          0.98
## experiment    0.98
## explod        0.98
## faith         0.98
## fall          0.98
## fate          0.98
## fault         0.98
## feed          0.98
## figur         0.98
## final         0.98
## find          0.98
## fireplac      0.98
## flash         0.98
## flesh         0.98
## flower        0.98
## food          0.98
## fragment      0.98
## fresh         0.98
## funki         0.98
## fuzzi         0.98
## garlic        0.98
## gem           0.98
## gospel        0.98
## gps           0.98
## grapefruit    0.98
## grate         0.98
## gratitud      0.98
## graviti       0.98
## groceri       0.98
## guin          0.98
## haiti         0.98
## healthi       0.98
## her           0.98
## hgtv          0.98
## hide          0.98
## ian           0.98
## illustr       0.98
## impress       0.98
## include       0.98
## incorpor      0.98
## india         0.98
## insignific    0.98
## instinct      0.98
## insult        0.98
## insurg        0.98
## intact        0.98
## invent        0.98
## invit         0.98
## juic          0.98
## julian        0.98
## jungl         0.98
## kati          0.98
## keyboard      0.98
## languag       0.98
## leaf          0.98
## lifestyl      0.98
## lipstick      0.98
## liu           0.98
## loos          0.98
## luke          0.98
## magnifi       0.98
## mainstream    0.98
## makeov        0.98
## mansion       0.98
## mask          0.98
## matur         0.98
## meaning       0.98
## microsoft     0.98
## miner         0.98
## miseri        0.98
## mix           0.98
## movi          0.98
## narrat        0.98
## narrow        0.98
## necessarili   0.98
## noisi         0.98
## opinion       0.98
## origin        0.98
## outdat        0.98
## overlook      0.98
## pair          0.98
## pasta         0.98
## peninsula     0.98
## peopl         0.98
## permiss       0.98
## pile          0.98
## polish        0.98
## pop           0.98
## preacher      0.98
## prefer        0.98
## preschool     0.98
## privileg      0.98
## progress      0.98
## pub           0.98
## queen         0.98
## radic         0.98
## rage          0.98
## republ        0.98
## resum         0.98
## revel         0.98
## revisit       0.98
## ring          0.98
## rocket        0.98
## salli         0.98
## salon         0.98
## saw           0.98
## scene         0.98
## scholar       0.98
## scream        0.98
## seem          0.98
## selfish       0.98
## shell         0.98
## shes          0.98
## shini         0.98
## shooter       0.98
## shop          0.98
## shower        0.98
## side          0.98
## simpl         0.98
## sip           0.98
## sit           0.98
## sky           0.98
## smoke         0.98
## soft          0.98
## solut         0.98
## spirit        0.98
## spit          0.98
## stain         0.98
## stapl         0.98
## start         0.98
## steward       0.98
## stick         0.98
## stori         0.98
## strand        0.98
## straw         0.98
## sudden        0.98
## sun           0.98
## sustain       0.98
## swing         0.98
## tale          0.98
## task          0.98
## tempt         0.98
## teresa        0.98
## there         0.98
## this          0.98
## thorough      0.98
## though        0.98
## toilet        0.98
## toler         0.98
## tomato        0.98
## ton           0.98
## tori          0.98
## tourist       0.98
## toward        0.98
## trace         0.98
## treat         0.98
## tree          0.98
## tri           0.98
## tuck          0.98
## turn          0.98
## twilight      0.98
## unaccept      0.98
## uncle         0.98
## understand    0.98
## underwear     0.98
## unfinish      0.98
## usualli       0.98
## veri          0.98
## vinyl         0.98
## visit         0.98
## vocabulari    0.98
## walnut        0.98
## warm          0.98
## warmer        0.98
## week          0.98
## well          0.98
## went          0.98
## wheel         0.98
## which         0.98
## whistl        0.98
## wikipedia     0.98
## woman         0.98
## wont          0.98
## yet           0.98
## advertis      0.97
## aisl          0.97
## ale           0.97
## almond        0.97
## although      0.97
## alway         0.97
## angri         0.97
## antiqu        0.97
## apolog        0.97
## appli         0.97
## becaus        0.97
## bottom        0.97
## brew          0.97
## butter        0.97
## buy           0.97
## calm          0.97
## can           0.97
## card          0.97
## carrot        0.97
## caus          0.97
## certain       0.97
## chain         0.97
## charact       0.97
## chariti       0.97
## chart         0.97
## choos         0.97
## cloth         0.97
## comparison    0.97
## content       0.97
## convers       0.97
## corrupt       0.97
## couldnt       0.97
## creami        0.97
## cross         0.97
## daili         0.97
## darker        0.97
## decent        0.97
## deep          0.97
## depress       0.97
## design        0.97
## desir         0.97
## die           0.97
## difficult     0.97
## disrupt       0.97
## distract      0.97
## dragon        0.97
## dust          0.97
## egg           0.97
## exclus        0.97
## fals          0.97
## fashion       0.97
## fiction       0.97
## flat          0.97
## fortun        0.97
## galleri       0.97
## gather        0.97
## generous      0.97
## genuin        0.97
## ghost         0.97
## given         0.97
## grandfath     0.97
## greater       0.97
## guitar        0.97
## hadnt         0.97
## hannah        0.97
## heavi         0.97
## idea          0.97
## innoc         0.97
## insert        0.97
## insight       0.97
## kid           0.97
## kindl         0.97
## label         0.97
## latter        0.97
## lay           0.97
## lean          0.97
## learn         0.97
## loom          0.97
## luxuri        0.97
## mani          0.97
## master        0.97
## memori        0.97
## moment        0.97
## moreov        0.97
## mother        0.97
## mouth         0.97
## mysteri       0.97
## name          0.97
## navig         0.97
## negat         0.97
## outlin        0.97
## pale          0.97
## parent        0.97
## pari          0.97
## percept       0.97
## philippin     0.97
## pirat         0.97
## proper        0.97
## psycholog     0.97
## quest         0.97
## quick         0.97
## quit          0.97
## ration        0.97
## romant        0.97
## rub           0.97
## rubber        0.97
## scientist     0.97
## scotland      0.97
## sea           0.97
## settl         0.97
## simpli        0.97
## sketch        0.97
## skin          0.97
## small         0.97
## smooth        0.97
## stir          0.97
## stress        0.97
## suggest       0.97
## taken         0.97
## teach         0.97
## theyd         0.97
## translat      0.97
## uniqu         0.97
## user          0.97
## usual         0.97
## vegetarian    0.97
## version       0.97
## view          0.97
## war           0.97
## wave          0.97
## when          0.97
## whimsic       0.97
## wore          0.97
## work          0.97
## acclaim       0.96
## aka           0.96
## albeit        0.96
## alec          0.96
## alice         0.96
## alli          0.96
## along         0.96
## ami           0.96
## amus          0.96
## anniversari   0.96
## answer        0.96
## aspect        0.96
## assert        0.96
## bedroom       0.96
## behalf        0.96
## belli         0.96
## better        0.96
## bound         0.96
## boy           0.96
## came          0.96
## chees         0.96
## chronic       0.96
## church        0.96
## citrus        0.96
## collar        0.96
## communic      0.96
## courtesi      0.96
## cow           0.96
## crafti        0.96
## crush         0.96
## daughter      0.96
## desert        0.96
## didnt         0.96
## direct        0.96
## distanc       0.96
## diva          0.96
## doll          0.96
## done          0.96
## drank         0.96
## dread         0.96
## drown         0.96
## edg           0.96
## entir         0.96
## especi        0.96
## exceed        0.96
## exclaim       0.96
## fanci         0.96
## fierc         0.96
## flick         0.96
## flow          0.96
## fog           0.96
## fond          0.96
## forgiv        0.96
## gabriel       0.96
## gate          0.96
## genius        0.96
## girlfriend    0.96
## grasp         0.96
## halfway       0.96
## hatch         0.96
## heal          0.96
## heel          0.96
## helen         0.96
## hollow        0.96
## hors          0.96
## hung          0.96
## ice           0.96
## imag          0.96
## imposs        0.96
## instant       0.96
## instead       0.96
## kingdom       0.96
## laptop        0.96
## lefti         0.96
## librari       0.96
## lisa          0.96
## littl         0.96
## livestock     0.96
## loyal         0.96
## luci          0.96
## lunch         0.96
## made          0.96
## margaret      0.96
## mediocr       0.96
## melt          0.96
## mileag        0.96
## nazi          0.96
## neat          0.96
## necessari     0.96
## old           0.96
## operat        0.96
## overlap       0.96
## pace          0.96
## pain          0.96
## pakistani     0.96
## palat         0.96
## patch         0.96
## person        0.96
## present       0.96
## priest        0.96
## probabl       0.96
## proclaim      0.96
## publish       0.96
## reassur       0.96
## reminisc      0.96
## remodel       0.96
## rich          0.96
## roof          0.96
## sake          0.96
## sauc          0.96
## sausag        0.96
## sentiment     0.96
## separ         0.96
## serial        0.96
## sinc          0.96
## size          0.96
## skull         0.96
## space         0.96
## spoken        0.96
## startl        0.96
## still         0.96
## stop          0.96
## store         0.96
## struggl       0.96
## sunset        0.96
## surpris       0.96
## tall          0.96
## templ         0.96
## thick         0.96
## tortur        0.96
## touch         0.96
## trader        0.96
## tragic        0.96
## undertak      0.96
## unhappi       0.96
## upcom         0.96
## updat         0.96
## veggi         0.96
## victoria      0.96
## violin        0.96
## visual        0.96
## wast          0.96
## water         0.96
## within        0.96
## abund         0.95
## actor         0.95
## almost        0.95
## altern        0.95
## array         0.95
## attitud       0.95
## autumn        0.95
## bark          0.95
## bodi          0.95
## bold          0.95
## boundari      0.95
## cake          0.95
## camera        0.95
## caramel       0.95
## carri         0.95
## ceas          0.95
## chase         0.95
## color         0.95
## core          0.95
## corner        0.95
## count         0.95
## demon         0.95
## detail        0.95
## dog           0.95
## ebook         0.95
## effect        0.95
## empire        0.95
## engag         0.95
## exist         0.95
## fluke         0.95
## for           0.95
## french        0.95
## fulli         0.95
## grill         0.95
## hamburg       0.95
## harder        0.95
## hold          0.95
## imagin        0.95
## impli         0.95
## import        0.95
## influenc      0.95
## intens        0.95
## islamic       0.95
## jew           0.95
## knowledg      0.95
## less          0.95
## lifetim       0.95
## lock          0.95
## main          0.95
## massiv        0.95
## meal          0.95
## mice          0.95
## minist        0.95
## minut         0.95
## mission       0.95
## mood          0.95
## network       0.95
## nowaday       0.95
## pack          0.95
## patienc       0.95
## pattern       0.95
## physic        0.95
## poor          0.95
## prepar        0.95
## print         0.95
## purchas       0.95
## recip         0.95
## relev         0.95
## respons       0.95
## rhythm        0.95
## roll          0.95
## rug           0.95
## sacr          0.95
## sacrific      0.95
## sand          0.95
## sat           0.95
## shade         0.95
## shake         0.95
## shi           0.95
## slim          0.95
## snap          0.95
## social        0.95
## sole          0.95
## specul        0.95
## stack         0.95
## steel         0.95
## strain        0.95
## subsequ       0.95
## surrend       0.95
## thin          0.95
## thread        0.95
## thus          0.95
## topic         0.95
## upon          0.95
## vampir        0.95
## wealth        0.95
## whole         0.95
## writer        0.95
## younger       0.95
## action        0.94
## actual        0.94
## adventur      0.94
## alberta       0.94
## ask           0.94
## beethoven     0.94
## befor         0.94
## birth         0.94
## bit           0.94
## blanket       0.94
## breach        0.94
## brick         0.94
## bug           0.94
## cater         0.94
## certif        0.94
## chang         0.94
## chilli        0.94
## chocol        0.94
## cinnamon      0.94
## come          0.94
## common        0.94
## conveni       0.94
## cooler        0.94
## counter       0.94
## dawn          0.94
## devot         0.94
## domain        0.94
## environ       0.94
## equal         0.94
## eventu        0.94
## everyth       0.94
## eye           0.94
## forth         0.94
## fundament     0.94
## germani       0.94
## hat           0.94
## immedi        0.94
## infus         0.94
## inner         0.94
## intellig      0.94
## invis         0.94
## israel        0.94
## itll          0.94
## jar           0.94
## journey       0.94
## kept          0.94
## lamb          0.94
## light         0.94
## literari      0.94
## marker        0.94
## mrs           0.94
## much          0.94
## natali        0.94
## need          0.94
## noah          0.94
## novel         0.94
## occasion      0.94
## palac         0.94
## peac          0.94
## perfect       0.94
## pin           0.94
## placement     0.94
## rather        0.94
## realiti       0.94
## reject        0.94
## russia        0.94
## sacrif        0.94
## satisfi       0.94
## scale         0.94
## scar          0.94
## scrub         0.94
## soak          0.94
## soap          0.94
## soda          0.94
## squar         0.94
## stall         0.94
## sticki        0.94
## stylist       0.94
## torment       0.94
## trip          0.94
## truth         0.94
## wander        0.94
## websit        0.94
## wise          0.94
## yell          0.94
## accident      0.93
## after         0.93
## approach      0.93
## approxim      0.93
## asylum        0.93
## background    0.93
## backward      0.93
## backyard      0.93
## bacon         0.93
## band          0.93
## bath          0.93
## below         0.93
## brain         0.93
## breweri       0.93
## brush         0.93
## children      0.93
## clock         0.93
## colleagu      0.93
## comput        0.93
## contain       0.93
## cookbook      0.93
## declar        0.93
## delic         0.93
## desper        0.93
## destroy       0.93
## disappear     0.93
## electron      0.93
## escap         0.93
## essay         0.93
## extra         0.93
## fail          0.93
## fair          0.93
## fifteen       0.93
## forgotten     0.93
## found         0.93
## franci        0.93
## friendship    0.93
## gender        0.93
## ginger        0.93
## grey          0.93
## half          0.93
## harm          0.93
## healthier     0.93
## hed           0.93
## hitler        0.93
## ident         0.93
## immediat      0.93
## informat      0.93
## itch          0.93
## jaw           0.93
## jimmi         0.93
## liber         0.93
## lift          0.93
## list          0.93
## magazin       0.93
## matter        0.93
## mechan        0.93
## medit         0.93
## merchandis    0.93
## musician      0.93
## normal        0.93
## paper         0.93
## particular    0.93
## peak          0.93
## pearl         0.93
## porter        0.93
## possibl       0.93
## potato        0.93
## process       0.93
## profess       0.93
## protein       0.93
## reader        0.93
## realiz        0.93
## rebecca       0.93
## reli          0.93
## remot         0.93
## room          0.93
## sang          0.93
## scienc        0.93
## sew           0.93
## shadow        0.93
## shame         0.93
## similar       0.93
## simul         0.93
## skate         0.93
## slept         0.93
## slow          0.93
## soundtrack    0.93
## sour          0.93
## sprinkl       0.93
## stamp         0.93
## stranger      0.93
## strength      0.93
## string        0.93
## symbol        0.93
## taught        0.93
## testament     0.93
## think         0.93
## thrill        0.93
## transgend     0.93
## wear          0.93
## zero          0.93
## alex          0.92
## anxieti       0.92
## artist        0.92
## barrel        0.92
## batch         0.92
## bead          0.92
## belov         0.92
## betray        0.92
## blur          0.92
## characterist  0.92
## circl         0.92
## clear         0.92
## closet        0.92
## consist       0.92
## copi          0.92
## daniel        0.92
## doctor        0.92
## drum          0.92
## ego           0.92
## fast          0.92
## floor         0.92
## foreign       0.92
## gotten        0.92
## grace         0.92
## grass         0.92
## handsom       0.92
## havent        0.92
## inspir        0.92
## joy           0.92
## leg           0.92
## letter        0.92
## line          0.92
## logic         0.92
## london        0.92
## look          0.92
## luggag        0.92
## mash          0.92
## mean          0.92
## mini          0.92
## modern        0.92
## needl         0.92
## nowher        0.92
## nutrit        0.92
## older         0.92
## onto          0.92
## pocket        0.92
## pressur       0.92
## prophet       0.92
## question      0.92
## realli        0.92
## refer         0.92
## rid           0.92
## scoop         0.92
## sequel        0.92
## set           0.92
## someth        0.92
## spare         0.92
## spinach       0.92
## spread        0.92
## subsid        0.92
## surviv        0.92
## tea           0.92
## tender        0.92
## thought       0.92
## towel         0.92
## trim          0.92
## trust         0.92
## underground   0.92
## virtual       0.92
## academ        0.91
## adopt         0.91
## age           0.91
## amount        0.91
## anyth         0.91
## automat       0.91
## avoid         0.91
## bathroom      0.91
## blame         0.91
## broad         0.91
## canada        0.91
## care          0.91
## collect       0.91
## compound      0.91
## conscious     0.91
## curs          0.91
## diet          0.91
## drawn         0.91
## eas           0.91
## embarrass     0.91
## encourag      0.91
## end           0.91
## enhanc        0.91
## futur         0.91
## gain          0.91
## garden        0.91
## glori         0.91
## grandpa       0.91
## greek         0.91
## heart         0.91
## henri         0.91
## here          0.91
## hunt          0.91
## ireland       0.91
## kitchen       0.91
## laugh         0.91
## lime          0.91
## mention       0.91
## middl         0.91
## monster       0.91
## needless      0.91
## newli         0.91
## note          0.91
## object        0.91
## phase         0.91
## photo         0.91
## pleasant      0.91
## poison        0.91
## popular       0.91
## pour          0.91
## powder        0.91
## pretend       0.91
## roast         0.91
## roman         0.91
## sensit        0.91
## sent          0.91
## shoe          0.91
## smile         0.91
## sort          0.91
## soviet        0.91
## spent         0.91
## thai          0.91
## toe           0.91
## unexpect      0.91
## valid         0.91
## whatev        0.91
## wide          0.91
## wild          0.91
## youd          0.91
## affair        0.90
## burn          0.90
## challeng      0.90
## cook          0.90
## couch         0.90
## dinner        0.90
## discuss       0.90
## england       0.90
## fascin        0.90
## glass         0.90
## grief         0.90
## habit         0.90
## lectur        0.90
## man           0.90
## nod           0.90
## panic         0.90
## precis        0.90
## sheer         0.90
## snack         0.90
## stay          0.90
## step          0.90
## stitch        0.90
## subtl         0.90
## target        0.90
## thing         0.90
## viewer        0.90
## whisper       0.90
## youv          0.90

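This shows all the words which have a correlation of 0.90 or higher with the word 'time'. In plain words, this means that the number of times each of these words appears in a document rises and falls very closely with the number of times the word 'time' appears; it is a measure of association across documents, not a literal "90% of the time" co-occurrence rate.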

# Plotting correlations
# Draw the correlation graph for (up to) the first 12 terms that occur at
# least 1400 times; only links with correlation >= 0.7 are shown.
# head() is used instead of `[1:12]` so that, if fewer than 12 terms pass the
# frequency cut, the term list is simply shorter rather than padded with NA.
plot(
  dtm,
  terms = head(findFreqTerms(dtm, lowfreq = 1400), 12),
  corThreshold = 0.7
)

plot of chunk unnamed-chunk-13

We can also look at the same statistics from the whole dataset without specifying a limit.

# Collapse the document-term matrix to one total count per term (column
# sums), then rank the terms from most to least frequent.
freq <- sort(
  colSums(as.matrix(dtm)),
  decreasing = TRUE
)
# Peek at the six most frequent terms.
head(freq)
##  the said will  one like just 
## 4772 2862 2598 2469 2272 2167
# Tabulate the term totals: one row per term, with the term in `word` and its
# total count in `freq` (row names are taken from the names of `freq`).
wf <- data.frame(
  word = names(freq),
  freq = freq
)
head(wf)
##      word freq
## the   the 4772
## said said 2862
## will will 2598
## one   one 2469
## like like 2272
## just just 2167

Plotting some frequencies

library(ggplot2)
# Bar chart of every term that occurs more than 1000 times; the x-axis labels
# are rotated 45 degrees so the words do not overlap.
ggplot(subset(wf, freq > 1000), aes(x = word, y = freq)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

plot of chunk unnamed-chunk-15

You can also make fancy word clouds

# Word cloud of the most frequent terms
library(wordcloud)
# Fix the RNG seed so the cloud looks the same on every run.
set.seed(123)
# Only terms that occur at least 700 times are drawn, coloured with six
# shades from the "Dark2" palette.
wordcloud(
  words = names(freq),
  freq = freq,
  min.freq = 700,
  colors = brewer.pal(6, "Dark2")
)

plot of chunk unnamed-chunk-16

Here we are making the word cloud for the words which appear at least 700 times in our sampled dataset.

Future Work

  1. Predictions on different samples of the dataset files, with the possibility of stacking models.
  2. Creation of a Shiny app with a user-friendly layout.